Compare commits
8 Commits
v0.17.0
...
v0.14.0rc2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7f42dc20bb | ||
|
|
c2a37a3cf8 | ||
|
|
0e31fc7996 | ||
|
|
6ac0fcf416 | ||
|
|
b62249725c | ||
|
|
1b57275207 | ||
|
|
2c24bc6996 | ||
|
|
0aa8c40552 |
@@ -169,6 +169,24 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build ROCm release image"
|
||||
key: block-rocm-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image (ROCm)"
|
||||
depends_on: block-rocm-release-image-build
|
||||
id: build-release-image-rocm
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# Build base image first
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
||||
# Build vLLM ROCm image using the base
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
||||
|
||||
|
||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
@@ -196,3 +214,365 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
# =============================================================================
|
||||
# ROCm Release Pipeline (x86_64 only)
|
||||
# =============================================================================
|
||||
#
|
||||
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
|
||||
# To build a specific version, trigger the build from that branch/tag.
|
||||
#
|
||||
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
|
||||
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
|
||||
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
|
||||
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
|
||||
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
|
||||
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
# ROCm Input Step - Collect build configuration (manual trigger only)
|
||||
- input: "ROCm Wheel Release Build Configuration"
|
||||
key: input-rocm-config
|
||||
depends_on: ~
|
||||
if: build.source == "ui"
|
||||
fields:
|
||||
- text: "Python Version"
|
||||
key: "rocm-python-version"
|
||||
default: "3.12"
|
||||
hint: "Python version (e.g., 3.12)"
|
||||
- text: "GPU Architectures"
|
||||
key: "rocm-pytorch-rocm-arch"
|
||||
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
||||
hint: "Semicolon-separated GPU architectures"
|
||||
- select: "Upload Wheels to S3"
|
||||
key: "rocm-upload-wheels"
|
||||
default: "true"
|
||||
options:
|
||||
- label: "No - Build only (nightly/dev)"
|
||||
value: "false"
|
||||
- label: "Yes - Upload to S3 (release)"
|
||||
value: "true"
|
||||
- select: "Force Rebuild Base Wheels"
|
||||
key: "rocm-force-rebuild"
|
||||
default: "false"
|
||||
hint: "Ignore S3 cache and rebuild base wheels from scratch"
|
||||
options:
|
||||
- label: "No - Use cached wheels if available"
|
||||
value: "false"
|
||||
- label: "Yes - Rebuild even if cache exists"
|
||||
value: "true"
|
||||
|
||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
||||
- label: ":rocm: Build ROCm Base Wheels"
|
||||
id: build-rocm-base-wheels
|
||||
depends_on:
|
||||
- step: input-rocm-config
|
||||
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
# Set configuration and check cache
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Get values from meta-data (set by input step) or use defaults
|
||||
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
|
||||
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
|
||||
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Check for force rebuild flag
|
||||
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
|
||||
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
|
||||
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Base Wheels Build Configuration"
|
||||
echo "========================================"
|
||||
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
|
||||
echo "========================================"
|
||||
|
||||
# Save resolved config for later jobs
|
||||
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
|
||||
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
|
||||
|
||||
# Check S3 cache for pre-built wheels
|
||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
|
||||
echo ""
|
||||
echo "Cache key: $${CACHE_KEY}"
|
||||
echo "Cache path: $${CACHE_PATH}"
|
||||
|
||||
# Save cache key for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
|
||||
|
||||
CACHE_STATUS="miss"
|
||||
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
|
||||
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
||||
else
|
||||
echo "Force rebuild requested, skipping cache check"
|
||||
fi
|
||||
|
||||
if [ "$${CACHE_STATUS}" = "hit" ]; then
|
||||
echo ""
|
||||
echo "CACHE HIT! Downloading pre-built wheels..."
|
||||
echo ""
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
||||
|
||||
# Set the S3 path for the cached Docker image (for Job 2 to download)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we used cache (for Docker image handling)
|
||||
buildkite-agent meta-data set "rocm-used-cache" "true"
|
||||
|
||||
echo ""
|
||||
echo "Cache download complete. Skipping Docker build."
|
||||
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
else
|
||||
echo ""
|
||||
echo "CACHE MISS. Building from scratch..."
|
||||
echo ""
|
||||
|
||||
# Build full base image (for later vLLM build)
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Build debs_wheel_release stage for wheel extraction
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
||||
--target debs_wheel_release \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Extract wheels from Docker image
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${container_id}
|
||||
echo "Extracted base wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
# Upload wheels to S3 cache for future builds
|
||||
echo ""
|
||||
echo "Uploading wheels to S3 cache..."
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
||||
|
||||
# Export base Docker image for reuse in vLLM build
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
echo "Docker image size:"
|
||||
ls -lh artifacts/rocm-docker-image/
|
||||
|
||||
# Upload large Docker image to S3 (also cached by cache key)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
|
||||
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Save the S3 path for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we did NOT use cache
|
||||
buildkite-agent meta-data set "rocm-used-cache" "false"
|
||||
|
||||
echo ""
|
||||
echo "Build complete. Wheels cached for future builds."
|
||||
fi
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-base-wheels/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||
- label: ":python: Build vLLM ROCm Wheel"
|
||||
id: build-rocm-vllm-wheel
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 180
|
||||
commands:
|
||||
# Download artifacts and prepare Docker image
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
|
||||
# This fixes version detection when tags are moved/force-pushed
|
||||
echo "Fetching latest tags from origin..."
|
||||
git fetch --tags --force origin
|
||||
|
||||
# Log tag information for debugging version detection
|
||||
echo "========================================"
|
||||
echo "Git Tag Verification"
|
||||
echo "========================================"
|
||||
echo "Current HEAD: $(git rev-parse HEAD)"
|
||||
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
|
||||
echo ""
|
||||
echo "Recent tags (pointing to commits near HEAD):"
|
||||
git tag -l --sort=-creatordate | head -5
|
||||
echo "setuptools_scm version detection:"
|
||||
pip install -q setuptools_scm 2>/dev/null || true
|
||||
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
|
||||
echo "========================================"
|
||||
|
||||
# Download wheel artifacts from current build
|
||||
echo "Downloading wheel artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
|
||||
# Download Docker image from S3 (too large for Buildkite artifacts)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image and capture the tag
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
echo "$${LOAD_OUTPUT}"
|
||||
# Extract the actual loaded image tag from "Loaded image: <tag>" output
|
||||
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
if [ -z "$${BASE_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: Failed to extract image tag from docker load output"
|
||||
echo "Load output was: $${LOAD_OUTPUT}"
|
||||
exit 1
|
||||
fi
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Prepare base wheels for Docker build context
|
||||
mkdir -p docker/context/base-wheels
|
||||
touch docker/context/base-wheels/.keep
|
||||
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
|
||||
echo "Base wheels for vLLM build:"
|
||||
ls -lh docker/context/base-wheels/
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM wheel with:"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--file docker/Dockerfile.rocm \
|
||||
--target export_vllm_wheel_release \
|
||||
--output type=local,dest=rocm-dist \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg REMOTE_VLLM=0 \
|
||||
--build-arg GIT_REPO_CHECK=1 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
.
|
||||
|
||||
echo "Built vLLM wheel:"
|
||||
ls -lh rocm-dist/*.whl
|
||||
|
||||
# Copy wheel to artifacts directory
|
||||
mkdir -p artifacts/rocm-vllm-wheel
|
||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
||||
echo "Final vLLM wheel:"
|
||||
ls -lh artifacts/rocm-vllm-wheel/
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-vllm-wheel/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 3: Upload Wheels to S3
|
||||
- label: ":s3: Upload ROCm Wheels to S3"
|
||||
id: upload-rocm-wheels
|
||||
depends_on:
|
||||
- step: build-rocm-vllm-wheel
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
# Download all wheel artifacts and run upload
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Check if upload is enabled (from env var, meta-data, or release branch)
|
||||
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
|
||||
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
|
||||
# Try to get from meta-data (input form)
|
||||
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "Upload check:"
|
||||
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo "========================================"
|
||||
|
||||
# Skip upload if not enabled
|
||||
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
|
||||
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
|
||||
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Upload enabled, proceeding..."
|
||||
|
||||
# Download artifacts from current build
|
||||
echo "Downloading artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
|
||||
|
||||
# Run upload script
|
||||
bash .buildkite/scripts/upload-rocm-wheels.sh
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 4: Annotate ROCm Wheel Release
|
||||
- label: ":memo: Annotate ROCm wheel release"
|
||||
id: annotate-rocm-release
|
||||
depends_on:
|
||||
- step: upload-rocm-wheels
|
||||
allow_failure: true
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
@@ -32,6 +32,7 @@ To download and upload the image:
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
|
||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
|
||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
||||
docker push vllm/vllm-openai:latest-rocm
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
74
.buildkite/scripts/annotate-rocm-release.sh
Executable file
74
.buildkite/scripts/annotate-rocm-release.sh
Executable file
@@ -0,0 +1,74 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Generate Buildkite annotation for ROCm wheel release
|
||||
|
||||
set -ex
|
||||
|
||||
# Get build configuration from meta-data
|
||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
|
||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
|
||||
# S3 URLs
|
||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
|
||||
|
||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||
## :rocm: ROCm Wheel Release
|
||||
|
||||
### Build Configuration
|
||||
| Setting | Value |
|
||||
|---------|-------|
|
||||
| **ROCm Version** | ${ROCM_VERSION} |
|
||||
| **Python Version** | ${PYTHON_VERSION} |
|
||||
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
|
||||
| **Branch** | \`${BUILDKITE_BRANCH}\` |
|
||||
| **Commit** | \`${BUILDKITE_COMMIT}\` |
|
||||
|
||||
### :package: Installation
|
||||
|
||||
**Install from this build (by commit):**
|
||||
\`\`\`bash
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
|
||||
|
||||
# Example:
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
|
||||
\`\`\`
|
||||
|
||||
**Install from nightly (if published):**
|
||||
\`\`\`bash
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
|
||||
\`\`\`
|
||||
|
||||
### :floppy_disk: Download Wheels Directly
|
||||
|
||||
\`\`\`bash
|
||||
# List all ROCm wheels
|
||||
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
|
||||
|
||||
# Download specific wheels
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
|
||||
\`\`\`
|
||||
|
||||
### :gear: Included Packages
|
||||
- **vllm**: vLLM with ROCm support
|
||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||
- **triton_rocm**: Triton built for ROCm
|
||||
- **torchvision**: TorchVision for ROCm PyTorch
|
||||
- **amdsmi**: AMD SMI Python bindings
|
||||
|
||||
### :warning: Notes
|
||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||
- Platform: Linux x86_64 only
|
||||
EOF
|
||||
140
.buildkite/scripts/cache-rocm-base-wheels.sh
Executable file
140
.buildkite/scripts/cache-rocm-base-wheels.sh
Executable file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Cache helper for ROCm base wheels
|
||||
#
|
||||
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
|
||||
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
|
||||
#
|
||||
# Usage:
|
||||
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
|
||||
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
|
||||
# cache-rocm-base-wheels.sh download - Download wheels from cache
|
||||
# cache-rocm-base-wheels.sh key - Output the cache key
|
||||
#
|
||||
# Environment variables:
|
||||
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
|
||||
# PYTHON_VERSION - Python version (affects cache key)
|
||||
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
|
||||
# so changes to ROCm version are captured by the Dockerfile hash.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
DOCKERFILE="docker/Dockerfile.rocm_base"
|
||||
CACHE_PREFIX="rocm/cache"
|
||||
|
||||
# Generate hash from Dockerfile content + build args
|
||||
generate_cache_key() {
|
||||
# Include Dockerfile content
|
||||
if [[ ! -f "$DOCKERFILE" ]]; then
|
||||
echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
|
||||
|
||||
# Include key build args that affect the output
|
||||
# These should match the ARGs in Dockerfile.rocm_base that change the build output
|
||||
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
|
||||
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
|
||||
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
|
||||
|
||||
echo "${dockerfile_hash}-${args_hash}"
|
||||
}
|
||||
|
||||
CACHE_KEY=$(generate_cache_key)
|
||||
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
|
||||
|
||||
case "${1:-}" in
|
||||
check)
|
||||
echo "Checking cache for key: ${CACHE_KEY}" >&2
|
||||
echo "Cache path: ${CACHE_PATH}" >&2
|
||||
echo "Variables used in cache key:" >&2
|
||||
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
|
||||
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
|
||||
|
||||
# Check if cache exists by listing objects
|
||||
# We look for at least one .whl file
|
||||
echo "Running: aws s3 ls ${CACHE_PATH}" >&2
|
||||
S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
|
||||
echo "S3 ls output:" >&2
|
||||
echo "$S3_OUTPUT" | head -5 >&2
|
||||
|
||||
if echo "$S3_OUTPUT" | grep -q "\.whl"; then
|
||||
echo "hit"
|
||||
else
|
||||
echo "miss"
|
||||
fi
|
||||
;;
|
||||
|
||||
upload)
|
||||
echo "========================================"
|
||||
echo "Uploading wheels to cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
|
||||
echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Uploading $WHEEL_COUNT wheels..."
|
||||
aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
|
||||
|
||||
echo ""
|
||||
echo "Cache upload complete!"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
download)
|
||||
echo "========================================"
|
||||
echo "Downloading wheels from cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
|
||||
|
||||
echo ""
|
||||
echo "Downloaded wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo ""
|
||||
echo "Total: $WHEEL_COUNT wheels"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
key)
|
||||
echo "${CACHE_KEY}"
|
||||
;;
|
||||
|
||||
path)
|
||||
echo "${CACHE_PATH}"
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Usage: $0 {check|upload|download|key|path}" >&2
|
||||
echo "" >&2
|
||||
echo "Commands:" >&2
|
||||
echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
|
||||
echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
|
||||
echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
|
||||
echo " key - Output the cache key" >&2
|
||||
echo " path - Output the full S3 cache path" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -16,6 +16,18 @@ from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
def normalize_package_name(name: str) -> str:
|
||||
"""
|
||||
Normalize package name according to PEP 503.
|
||||
https://peps.python.org/pep-0503/#normalized-names
|
||||
|
||||
Replace runs of underscores, hyphens, and periods with a single hyphen,
|
||||
and lowercase the result.
|
||||
"""
|
||||
return re.sub(r"[-_.]+", "-", name).lower()
|
||||
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
version = version.removesuffix("." + variant)
|
||||
else:
|
||||
if "+" in version:
|
||||
version, variant = version.split("+")
|
||||
version_part, suffix = version.split("+", 1)
|
||||
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
|
||||
# Git hashes and other suffixes are NOT variants
|
||||
if suffix.startswith(("rocm", "cu", "cpu")):
|
||||
variant = suffix
|
||||
version = version_part
|
||||
# Otherwise keep the full version string (variant stays None)
|
||||
|
||||
return WheelFileInfo(
|
||||
package_name=package_name,
|
||||
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
|
||||
print("No wheel files found, skipping index generation.")
|
||||
return
|
||||
|
||||
# For ROCm builds: inherit variant from vllm wheel
|
||||
# All ROCm wheels should share the same variant as vllm
|
||||
rocm_variant = None
|
||||
for file in parsed_files:
|
||||
if (
|
||||
file.package_name == "vllm"
|
||||
and file.variant
|
||||
and file.variant.startswith("rocm")
|
||||
):
|
||||
rocm_variant = file.variant
|
||||
print(f"Detected ROCm variant from vllm: {rocm_variant}")
|
||||
break
|
||||
|
||||
# Apply ROCm variant to all wheels without a variant
|
||||
if rocm_variant:
|
||||
for file in parsed_files:
|
||||
if file.variant is None:
|
||||
file.variant = rocm_variant
|
||||
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
|
||||
|
||||
# Group by variant
|
||||
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
||||
for file in parsed_files:
|
||||
@@ -256,8 +294,8 @@ def generate_index_and_metadata(
|
||||
|
||||
variant_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# gather all package names in this variant
|
||||
packages = set(f.package_name for f in files)
|
||||
# gather all package names in this variant (normalized per PEP 503)
|
||||
packages = set(normalize_package_name(f.package_name) for f in files)
|
||||
if variant == "default":
|
||||
# these packages should also appear in the "project list"
|
||||
# generate after all variants are processed
|
||||
@@ -269,8 +307,10 @@ def generate_index_and_metadata(
|
||||
f.write(project_list_str)
|
||||
|
||||
for package in packages:
|
||||
# filter files belonging to this package only
|
||||
package_files = [f for f in files if f.package_name == package]
|
||||
# filter files belonging to this package only (compare normalized names)
|
||||
package_files = [
|
||||
f for f in files if normalize_package_name(f.package_name) == package
|
||||
]
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
@@ -341,8 +381,13 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
version = args.version
|
||||
if "/" in version or "\\" in version:
|
||||
raise ValueError("Version string must not contain slashes.")
|
||||
# Allow rocm/ prefix, reject other slashes and all backslashes
|
||||
if "\\" in version:
|
||||
raise ValueError("Version string must not contain backslashes.")
|
||||
if "/" in version and not version.startswith("rocm/"):
|
||||
raise ValueError(
|
||||
"Version string must not contain slashes (except for 'rocm/' prefix)."
|
||||
)
|
||||
current_objects_path = Path(args.current_objects)
|
||||
output_dir = Path(args.output_dir)
|
||||
if not output_dir.exists():
|
||||
@@ -393,8 +438,23 @@ if __name__ == "__main__":
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{wheel_dir}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
wheel_dir = args.wheel_dir or version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
|
||||
#
|
||||
# For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
|
||||
# - rocm/{commit}/ (same as wheels)
|
||||
# - rocm/nightly/
|
||||
# - rocm/{version}/
|
||||
# All these are under the "rocm/" prefix, so relative paths should be
|
||||
# relative to "rocm/", not the bucket root.
|
||||
if args.wheel_dir:
|
||||
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
|
||||
wheel_dir = args.wheel_dir.strip().rstrip("/")
|
||||
elif version.startswith("rocm/"):
|
||||
# For rocm/commit, wheel_base_dir should be just the commit part
|
||||
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
|
||||
wheel_dir = version.split("/", 1)[1]
|
||||
else:
|
||||
wheel_dir = version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir
|
||||
index_base_dir = Path(output_dir)
|
||||
|
||||
generate_index_and_metadata(
|
||||
|
||||
151
.buildkite/scripts/upload-rocm-wheels.sh
Executable file
151
.buildkite/scripts/upload-rocm-wheels.sh
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Upload ROCm wheels to S3 with proper index generation
|
||||
#
|
||||
# Required environment variables:
|
||||
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
|
||||
# S3_BUCKET (default: vllm-wheels)
|
||||
#
|
||||
# S3 path structure:
|
||||
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
|
||||
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
|
||||
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== Configuration ========
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
INDICES_OUTPUT_DIR="rocm-indices"
|
||||
PYTHON="${PYTHON_PROG:-python3}"
|
||||
|
||||
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
|
||||
MANYLINUX_VERSION="manylinux_2_35"
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Configuration"
|
||||
echo "========================================"
|
||||
echo "S3 Bucket: $BUCKET"
|
||||
echo "S3 Path: $ROCM_SUBPATH"
|
||||
echo "Commit: $BUILDKITE_COMMIT"
|
||||
echo "Branch: $BUILDKITE_BRANCH"
|
||||
echo "========================================"
|
||||
|
||||
# ======== Part 0: Setup Python ========
|
||||
|
||||
# Detect if python3.12+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# Use new python from docker
|
||||
# Use --user to ensure files are created with correct ownership (not root)
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ======== Part 1: Collect and prepare wheels ========
|
||||
|
||||
# Collect all wheels
|
||||
mkdir -p all-rocm-wheels
|
||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
|
||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||
|
||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||
echo "ERROR: No wheels found to upload!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Rename linux to manylinux in wheel filenames
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
|
||||
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "Wheels to upload:"
|
||||
ls -lh all-rocm-wheels/
|
||||
|
||||
# ======== Part 2: Upload wheels to S3 ========
|
||||
|
||||
echo ""
|
||||
echo "Uploading wheels to $S3_COMMIT_PREFIX"
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
done
|
||||
|
||||
# ======== Part 3: Generate and upload indices ========
|
||||
|
||||
# List existing wheels in commit directory
|
||||
echo ""
|
||||
echo "Generating indices..."
|
||||
obj_json="rocm-objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# Use the existing generate-nightly-index.py
|
||||
# HACK: Replace regex module with stdlib re (same as CUDA script)
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py \
|
||||
--version "$ROCM_SUBPATH" \
|
||||
--current-objects "$obj_json" \
|
||||
--output-dir "$INDICES_OUTPUT_DIR" \
|
||||
--comment "ROCm commit $BUILDKITE_COMMIT"
|
||||
|
||||
# Upload indices to commit directory
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# Update rocm/nightly/ if on main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Updating rocm/nightly/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
|
||||
fi
|
||||
|
||||
# Extract version from vLLM wheel and update version-specific index
|
||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
||||
if [ -n "$VLLM_WHEEL" ]; then
|
||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $VERSION"
|
||||
PURE_VERSION="${VERSION%%+*}"
|
||||
PURE_VERSION="${PURE_VERSION%%.rocm}"
|
||||
echo "Pure version: $PURE_VERSION"
|
||||
|
||||
if [[ "$VERSION" != *"dev"* ]]; then
|
||||
echo "Updating rocm/$PURE_VERSION/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ======== Part 4: Summary ========
|
||||
|
||||
echo ""
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Complete!"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo "Wheels available at:"
|
||||
echo " s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
echo "Install command (by commit):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Install command (nightly):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
|
||||
fi
|
||||
echo ""
|
||||
echo "Wheel count: $WHEEL_COUNT"
|
||||
echo "========================================"
|
||||
@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
|
||||
ARG COMMON_WORKDIR=/app
|
||||
ARG BASE_IMAGE=rocm/vllm-dev:base
|
||||
|
||||
# Sccache configuration (only used in release pipeline)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
ARG ARG_PYTORCH_ROCM_ARCH
|
||||
@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
|
||||
RUN apt-get update -q -y && apt-get install -q -y \
|
||||
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
|
||||
apt-transport-https ca-certificates wget curl
|
||||
# Remove sccache
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
|
||||
ARG USE_SCCACHE
|
||||
RUN if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
apt-get purge -y sccache || true; \
|
||||
python3 -m pip uninstall -y sccache || true; \
|
||||
rm -f "$(which sccache)" || true; \
|
||||
fi
|
||||
|
||||
# Install UV
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
|
||||
@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Install sccache if USE_SCCACHE is enabled (for release builds)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME
|
||||
ARG SCCACHE_REGION_NAME
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS
|
||||
RUN if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
if command -v sccache >/dev/null 2>&1; then \
|
||||
echo "sccache already installed, skipping installation"; \
|
||||
sccache --version; \
|
||||
else \
|
||||
echo "Installing sccache..." \
|
||||
&& SCCACHE_ARCH="x86_64" \
|
||||
&& SCCACHE_VERSION="v0.8.1" \
|
||||
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
|
||||
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
|
||||
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
|
||||
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& chmod +x /usr/bin/sccache \
|
||||
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
|
||||
&& sccache --version; \
|
||||
fi; \
|
||||
fi
|
||||
|
||||
# Set sccache environment variables only when USE_SCCACHE=1
|
||||
# This prevents S3 config from leaking into images when sccache is not used
|
||||
ARG USE_SCCACHE
|
||||
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
|
||||
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
|
||||
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
|
||||
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
|
||||
|
||||
ARG COMMON_WORKDIR
|
||||
WORKDIR ${COMMON_WORKDIR}
|
||||
|
||||
@@ -51,7 +97,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||
# -----------------------
|
||||
# vLLM build stages
|
||||
FROM fetch_vllm AS build_vllm
|
||||
# Build vLLM
|
||||
# Build vLLM (setup.py auto-detects sccache in PATH)
|
||||
RUN cd vllm \
|
||||
&& python3 -m pip install -r requirements/rocm.txt \
|
||||
&& python3 setup.py clean --all \
|
||||
@@ -67,6 +113,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
|
||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
|
||||
|
||||
# RIXL/UCX build stages
|
||||
FROM base AS build_rixl
|
||||
ARG RIXL_BRANCH="f33a5599"
|
||||
ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
|
||||
ARG UCX_BRANCH="da3fac2a"
|
||||
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV UCX_HOME=/usr/local/ucx
|
||||
ENV RIXL_HOME=/usr/local/rixl
|
||||
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
|
||||
|
||||
# RIXL build system dependences and RDMA support
|
||||
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
|
||||
libgrpc-dev \
|
||||
libgrpc++-dev \
|
||||
libprotobuf-dev \
|
||||
protobuf-compiler-grpc \
|
||||
libcpprest-dev \
|
||||
libaio-dev \
|
||||
librdmacm1 \
|
||||
librdmacm-dev \
|
||||
libibverbs1 \
|
||||
libibverbs-dev \
|
||||
ibverbs-utils \
|
||||
rdmacm-utils \
|
||||
ibverbs-providers \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN uv pip install --system meson auditwheel patchelf tomlkit
|
||||
|
||||
RUN cd /usr/local/src && \
|
||||
git clone ${UCX_REPO} && \
|
||||
cd ucx && \
|
||||
git checkout ${UCX_BRANCH} && \
|
||||
./autogen.sh && \
|
||||
mkdir build && cd build && \
|
||||
../configure \
|
||||
--prefix=/usr/local/ucx \
|
||||
--enable-shared \
|
||||
--disable-static \
|
||||
--disable-doxygen-doc \
|
||||
--enable-optimizations \
|
||||
--enable-devel-headers \
|
||||
--with-rocm=/opt/rocm \
|
||||
--with-verbs \
|
||||
--with-dm \
|
||||
--enable-mt && \
|
||||
make -j && \
|
||||
make install
|
||||
|
||||
ENV PATH=/usr/local/ucx/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
|
||||
|
||||
RUN git clone ${RIXL_REPO} /opt/rixl && \
|
||||
cd /opt/rixl && \
|
||||
git checkout ${RIXL_BRANCH} && \
|
||||
meson setup build --prefix=${RIXL_HOME} \
|
||||
-Ducx_path=${UCX_HOME} \
|
||||
-Drocm_path=${ROCM_PATH} && \
|
||||
cd build && \
|
||||
ninja && \
|
||||
ninja install
|
||||
|
||||
# Generate RIXL wheel
|
||||
RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||
./contrib/build-wheel.sh \
|
||||
--output-dir /app/install \
|
||||
--rocm-dir ${ROCM_PATH} \
|
||||
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
||||
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
||||
|
||||
|
||||
# -----------------------
|
||||
# vLLM wheel release build stage (for building distributable wheels)
|
||||
# This stage pins dependencies to custom ROCm wheel versions and handles version detection
|
||||
FROM fetch_vllm AS build_vllm_wheel_release
|
||||
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
# Create /install directory for custom wheels
|
||||
RUN mkdir -p /install
|
||||
|
||||
# Copy custom ROCm wheels from docker/context if they exist
|
||||
# COPY ensures Docker cache is invalidated when wheels change
|
||||
# .keep file ensures directory always exists for COPY to work
|
||||
COPY docker/context/base-wheels/ /tmp/base-wheels/
|
||||
# This is how we know if we are building for a wheel release or not.
|
||||
# If there are not wheels found there, we are not building for a wheel release.
|
||||
# So we exit with an error. To skip this stage.
|
||||
RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
|
||||
echo "Found custom wheels - copying to /install"; \
|
||||
cp /tmp/base-wheels/*.whl /install/ && \
|
||||
echo "Copied custom wheels:"; \
|
||||
ls -lh /install/; \
|
||||
else \
|
||||
echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
|
||||
echo "Wheel releases require pre-built ROCm wheels."; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
|
||||
# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
|
||||
echo "Running repository checks..."; \
|
||||
cd vllm && bash tools/check_repo.sh; \
|
||||
fi
|
||||
|
||||
# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
|
||||
# This ensures setuptools_scm sees clean repo state for version detection
|
||||
RUN --mount=type=bind,source=.git,target=vllm/.git \
|
||||
cd vllm \
|
||||
&& pip install setuptools_scm \
|
||||
&& VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
|
||||
&& echo "Detected vLLM version: ${VLLM_VERSION}" \
|
||||
&& echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
|
||||
|
||||
# Fail if git-based package dependencies are found in requirements files
|
||||
# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
|
||||
# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
|
||||
RUN echo "Checking for git-based packages in requirements files..." \
|
||||
&& echo "Checking common.txt for git-based packages:" \
|
||||
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
|
||||
echo "ERROR: Git-based packages found in common.txt:"; \
|
||||
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
|
||||
echo "Please publish these packages to PyPI instead of using git dependencies."; \
|
||||
exit 1; \
|
||||
else \
|
||||
echo " ✓ No git-based packages found in common.txt"; \
|
||||
fi \
|
||||
&& echo "Checking rocm.txt for git-based packages:" \
|
||||
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
|
||||
echo "ERROR: Git-based packages found in rocm.txt:"; \
|
||||
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
|
||||
echo "Please publish these packages to PyPI instead of using git dependencies."; \
|
||||
exit 1; \
|
||||
else \
|
||||
echo " ✓ No git-based packages found in rocm.txt"; \
|
||||
fi \
|
||||
&& echo "All requirements files are clean - no git-based packages found"
|
||||
|
||||
# Pin vLLM dependencies to exact versions of custom ROCm wheels
|
||||
# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
|
||||
COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
|
||||
RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
|
||||
&& python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
|
||||
|
||||
# Install dependencies using custom wheels from /install
|
||||
RUN cd vllm \
|
||||
&& echo "Building vLLM with custom wheels from /install" \
|
||||
&& python3 -m pip install --find-links /install -r requirements/rocm.txt \
|
||||
&& python3 setup.py clean --all
|
||||
|
||||
# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
|
||||
# (setup.py auto-detects sccache in PATH)
|
||||
RUN --mount=type=bind,source=.git,target=vllm/.git \
|
||||
cd vllm \
|
||||
&& export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
|
||||
&& echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||
|
||||
FROM scratch AS export_vllm_wheel_release
|
||||
ARG COMMON_WORKDIR
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
|
||||
|
||||
# -----------------------
|
||||
# Test vLLM image
|
||||
FROM base AS test
|
||||
@@ -159,3 +377,7 @@ ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
|
||||
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
#Set entrypoint for vllm-openai official images
|
||||
FROM final As vllm-openai
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@@ -14,16 +14,13 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
ARG MORI_BRANCH="2d02c6a9"
|
||||
ARG MORI_REPO="https://github.com/ROCm/mori.git"
|
||||
|
||||
#TODO: When patch has been upstreamed, switch to the main repo/branch
|
||||
# ARG RIXL_BRANCH="<TODO>"
|
||||
# ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
|
||||
ARG RIXL_BRANCH="50d63d94"
|
||||
ARG RIXL_REPO="https://github.com/vcave/RIXL.git"
|
||||
# Needed by RIXL
|
||||
ARG ETCD_BRANCH="7c6e714f"
|
||||
ARG ETCD_REPO="https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git"
|
||||
ARG UCX_BRANCH="da3fac2a"
|
||||
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
|
||||
# Sccache configuration (only used in release pipeline)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
@@ -64,6 +61,49 @@ RUN apt-get update -y \
|
||||
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
|
||||
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install sccache if USE_SCCACHE is enabled (for release builds)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME
|
||||
ARG SCCACHE_REGION_NAME
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS
|
||||
RUN if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& SCCACHE_ARCH="x86_64" \
|
||||
&& SCCACHE_VERSION="v0.8.1" \
|
||||
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
|
||||
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
|
||||
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
|
||||
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& chmod +x /usr/bin/sccache \
|
||||
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
|
||||
&& sccache --version; \
|
||||
fi
|
||||
|
||||
# Setup sccache for HIP compilation via HIP_CLANG_PATH
|
||||
# This creates wrapper scripts in a separate directory and points HIP to use them
|
||||
# This avoids modifying the original ROCm binaries which can break detection
|
||||
# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
|
||||
# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
|
||||
RUN if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Setting up sccache wrappers for HIP compilation..." \
|
||||
&& mkdir -p /opt/sccache-wrappers \
|
||||
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
|
||||
&& chmod +x /opt/sccache-wrappers/clang++ \
|
||||
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
|
||||
&& chmod +x /opt/sccache-wrappers/clang \
|
||||
&& echo "sccache wrappers created in /opt/sccache-wrappers"; \
|
||||
fi
|
||||
|
||||
# Set sccache environment variables only when USE_SCCACHE=1
|
||||
# This prevents S3 config from leaking into images when sccache is not used
|
||||
ARG USE_SCCACHE
|
||||
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
|
||||
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
|
||||
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
|
||||
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
|
||||
|
||||
|
||||
###
|
||||
### Triton Build
|
||||
@@ -100,22 +140,42 @@ ARG PYTORCH_AUDIO_BRANCH
|
||||
ARG PYTORCH_REPO
|
||||
ARG PYTORCH_VISION_REPO
|
||||
ARG PYTORCH_AUDIO_REPO
|
||||
ARG USE_SCCACHE
|
||||
|
||||
RUN git clone ${PYTORCH_REPO} pytorch
|
||||
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
|
||||
&& pip install -r requirements.txt && git submodule update --init --recursive \
|
||||
&& python3 tools/amd_build/build_amd.py \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
|
||||
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
|
||||
&& sccache --show-stats; \
|
||||
fi \
|
||||
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& pip install dist/*.whl
|
||||
RUN git clone ${PYTORCH_VISION_REPO} vision
|
||||
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
|
||||
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
|
||||
fi \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& pip install dist/*.whl
|
||||
RUN git clone ${PYTORCH_AUDIO_REPO} audio
|
||||
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
|
||||
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
|
||||
fi \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& pip install dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||
&& cp /app/vision/dist/*.whl /app/install \
|
||||
@@ -230,13 +290,19 @@ RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||
FROM base AS build_fa
|
||||
ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
ARG USE_SCCACHE
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN git clone ${FA_REPO}
|
||||
RUN cd flash-attention \
|
||||
&& git checkout ${FA_BRANCH} \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& sccache --show-stats; \
|
||||
fi \
|
||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
|
||||
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
|
||||
|
||||
|
||||
@@ -246,6 +312,7 @@ RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
|
||||
FROM base AS build_aiter
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
ARG USE_SCCACHE
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
@@ -253,13 +320,37 @@ RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN pip install pyyaml && cd aiter \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& sccache --show-stats; \
|
||||
fi \
|
||||
&& PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& ls /app/aiter/dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
||||
|
||||
|
||||
###
|
||||
### Final Build
|
||||
###
|
||||
|
||||
# Wheel release stage -
|
||||
# only includes dependencies used by wheel release pipeline
|
||||
FROM base AS debs_wheel_release
|
||||
RUN mkdir /app/debs
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
# Full debs stage - includes Mori (used by Docker releases)
|
||||
FROM base AS debs
|
||||
RUN mkdir /app/debs
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
|
||||
@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
|
||||
```json
|
||||
{
|
||||
"pooling_type": "auto",
|
||||
"normalize": true,
|
||||
"use_activation": true,
|
||||
"enable_chunked_processing": true,
|
||||
"max_embed_len": 3072000
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ Prerequisites:
|
||||
# MEAN pooling (processes all chunks, recommended for complete coverage)
|
||||
vllm serve intfloat/multilingual-e5-large \
|
||||
--pooler-config \
|
||||
'{"pooling_type": "MEAN", "normalize": true, ' \
|
||||
'{"pooling_type": "MEAN", "use_activation": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \
|
||||
--served-model-name multilingual-e5-large \
|
||||
--trust-remote-code \
|
||||
@@ -24,7 +24,7 @@ Prerequisites:
|
||||
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
|
||||
vllm serve BAAI/bge-large-en-v1.5 \
|
||||
--pooler-config \
|
||||
'{"pooling_type": "CLS", "normalize": true, ' \
|
||||
'{"pooling_type": "CLS", "use_activation": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \
|
||||
--served-model-name bge-large-en-v1.5 \
|
||||
--trust-remote-code \
|
||||
|
||||
@@ -96,7 +96,7 @@ echo ""
|
||||
echo "🔧 Starting server with enhanced chunked processing configuration..."
|
||||
|
||||
# Build pooler config JSON
|
||||
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
|
||||
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
|
||||
|
||||
# Start vLLM server with enhanced chunked processing
|
||||
vllm serve "$MODEL_NAME" \
|
||||
|
||||
@@ -80,6 +80,8 @@ num2words==0.5.14
|
||||
pqdm==0.2.0
|
||||
# via lm-eval
|
||||
|
||||
# Required for fastsafetensors test
|
||||
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
|
||||
# Required for suffix decoding test
|
||||
arctic-inference == 0.1.1
|
||||
# Required for Nemotron test
|
||||
|
||||
@@ -15,5 +15,4 @@ setuptools-scm>=8
|
||||
runai-model-streamer[s3,gcs]==0.15.3
|
||||
conch-triton-kernels==1.2.1
|
||||
timm>=1.0.17
|
||||
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
|
||||
grpcio-tools>=1.76.0
|
||||
@@ -53,7 +53,9 @@ def test_token_embed(llm: LLM):
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(normalize):
|
||||
outputs = llm.embed(
|
||||
prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
|
||||
prompts,
|
||||
pooling_params=PoolingParams(use_activation=normalize),
|
||||
use_tqdm=False,
|
||||
)
|
||||
return torch.tensor([x.outputs.embedding for x in outputs])
|
||||
|
||||
|
||||
@@ -216,7 +216,7 @@ def server_with_chunked_processing():
|
||||
"512", # Set smaller max_model_len to trigger chunking mechanism
|
||||
"--pooler-config",
|
||||
(
|
||||
'{"pooling_type": "MEAN", "normalize": true, '
|
||||
'{"pooling_type": "MEAN", "use_activation": true, '
|
||||
'"enable_chunked_processing": true, "max_embed_len": 10000}'
|
||||
),
|
||||
"--gpu-memory-utilization",
|
||||
|
||||
@@ -236,17 +236,14 @@ class TestModel:
|
||||
"use_activation": use_activation,
|
||||
},
|
||||
)
|
||||
if response.status_code != 200:
|
||||
return response
|
||||
|
||||
outputs = response.json()
|
||||
return torch.tensor([x["score"] for x in outputs["data"]])
|
||||
|
||||
if model["is_cross_encoder"]:
|
||||
default = get_outputs(use_activation=None)
|
||||
w_activation = get_outputs(use_activation=True)
|
||||
wo_activation = get_outputs(use_activation=False)
|
||||
default = get_outputs(use_activation=None)
|
||||
w_activation = get_outputs(use_activation=True)
|
||||
wo_activation = get_outputs(use_activation=False)
|
||||
|
||||
if model["is_cross_encoder"]:
|
||||
assert torch.allclose(default, w_activation, atol=1e-2), (
|
||||
"Default should use activation."
|
||||
)
|
||||
@@ -256,9 +253,3 @@ class TestModel:
|
||||
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
|
||||
"w_activation should be close to activation(wo_activation)."
|
||||
)
|
||||
else:
|
||||
get_outputs(use_activation=None)
|
||||
|
||||
# The activation parameter only works for the is_cross_encoder model
|
||||
response = get_outputs(use_activation=True)
|
||||
assert response.status_code == 400
|
||||
|
||||
@@ -48,7 +48,7 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
# asserts on the pooling config files
|
||||
assert model_config.pooler_config.seq_pooling_type == "CLS"
|
||||
assert model_config.pooler_config.tok_pooling_type == "ALL"
|
||||
assert model_config.pooler_config.normalize
|
||||
assert model_config.pooler_config.use_activation
|
||||
|
||||
# asserts on the tokenizer loaded
|
||||
assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
|
||||
@@ -93,7 +93,7 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
# asserts on the pooling config files
|
||||
assert model_config.pooler_config.seq_pooling_type == "MEAN"
|
||||
assert model_config.pooler_config.tok_pooling_type == "ALL"
|
||||
assert model_config.pooler_config.normalize
|
||||
assert model_config.pooler_config.use_activation
|
||||
|
||||
# asserts on the tokenizer loaded
|
||||
assert model_config.tokenizer == "intfloat/multilingual-e5-base"
|
||||
|
||||
@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False),
|
||||
pooler_config=PoolerConfig(use_activation=False),
|
||||
) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True),
|
||||
pooler_config=PoolerConfig(use_activation=True),
|
||||
) as vllm_model:
|
||||
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False),
|
||||
pooler_config=PoolerConfig(use_activation=False),
|
||||
) as vllm_model:
|
||||
wo_normalize = vllm_model.token_embed(example_prompts)
|
||||
|
||||
@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True),
|
||||
pooler_config=PoolerConfig(use_activation=True),
|
||||
) as vllm_model:
|
||||
w_normalize = vllm_model.token_embed(example_prompts)
|
||||
|
||||
|
||||
@@ -160,7 +160,7 @@ def test_get_pooling_config():
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
assert model_config.pooler_config is not None
|
||||
assert model_config.pooler_config.normalize
|
||||
assert model_config.pooler_config.use_activation
|
||||
assert model_config.pooler_config.seq_pooling_type == "MEAN"
|
||||
assert model_config.pooler_config.tok_pooling_type == "ALL"
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ EMBEDDING_MODELS = [
|
||||
]
|
||||
|
||||
classify_parameters = ["use_activation"]
|
||||
embed_parameters = ["dimensions", "normalize"]
|
||||
embed_parameters = ["dimensions", "use_activation"]
|
||||
step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
|
||||
|
||||
|
||||
@@ -42,17 +42,17 @@ def test_embed():
|
||||
task = "embed"
|
||||
model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
|
||||
|
||||
pooling_params = PoolingParams(normalize=None)
|
||||
pooling_params = PoolingParams(use_activation=None)
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
pooling_params = PoolingParams(normalize=True)
|
||||
pooling_params = PoolingParams(use_activation=True)
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
pooling_params = PoolingParams(normalize=False)
|
||||
pooling_params = PoolingParams(use_activation=False)
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
invalid_parameters = classify_parameters + step_pooling_parameters
|
||||
for p in invalid_parameters:
|
||||
for p in set(invalid_parameters) - set(embed_parameters):
|
||||
with pytest.raises(ValueError):
|
||||
pooling_params = PoolingParams(**{p: True})
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
@@ -98,7 +98,7 @@ def test_classify(task):
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
invalid_parameters = embed_parameters + step_pooling_parameters
|
||||
for p in invalid_parameters:
|
||||
for p in set(invalid_parameters) - set(classify_parameters):
|
||||
with pytest.raises(ValueError):
|
||||
pooling_params = PoolingParams(**{p: True})
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
@@ -111,20 +111,20 @@ def test_token_embed(pooling_type: str):
|
||||
pooler_config=PoolerConfig(tok_pooling_type=pooling_type)
|
||||
)
|
||||
|
||||
pooling_params = PoolingParams(normalize=None)
|
||||
pooling_params = PoolingParams(use_activation=None)
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
pooling_params = PoolingParams(normalize=True)
|
||||
pooling_params = PoolingParams(use_activation=True)
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
pooling_params = PoolingParams(normalize=False)
|
||||
pooling_params = PoolingParams(use_activation=False)
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
invalid_parameters = classify_parameters
|
||||
if pooling_type != "STEP":
|
||||
invalid_parameters = classify_parameters + step_pooling_parameters
|
||||
|
||||
for p in invalid_parameters:
|
||||
for p in set(invalid_parameters) - set(embed_parameters):
|
||||
with pytest.raises(ValueError):
|
||||
pooling_params = PoolingParams(**{p: True})
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
@@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str):
|
||||
if pooling_type != "STEP":
|
||||
invalid_parameters = embed_parameters + step_pooling_parameters
|
||||
|
||||
for p in invalid_parameters:
|
||||
for p in set(invalid_parameters) - set(classify_parameters):
|
||||
with pytest.raises(ValueError):
|
||||
pooling_params = PoolingParams(**{p: True})
|
||||
pooling_params.verify(task=task, model_config=model_config)
|
||||
|
||||
@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
|
||||
("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
|
||||
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
|
||||
("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
|
||||
(None, None, 1, 1, None, None),
|
||||
# When kv_offloading_size is None, offloading is disabled (backend is ignored)
|
||||
("native", None, 1, 1, None, None),
|
||||
],
|
||||
)
|
||||
def test_kv_connector(
|
||||
@@ -62,3 +63,19 @@ def test_kv_connector(
|
||||
assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
|
||||
# Existing config should be replaced
|
||||
assert "existing_key" not in kv_connector_extra_config
|
||||
|
||||
|
||||
def test_kv_offloading_size_only_uses_native_default():
|
||||
"""Test that setting only kv_offloading_size enables native offloading."""
|
||||
vllm_config = VllmConfig(
|
||||
cache_config=CacheConfig(
|
||||
kv_offloading_size=4.0,
|
||||
# kv_offloading_backend not set, should default to "native"
|
||||
),
|
||||
)
|
||||
|
||||
kv_transfer_config = vllm_config.kv_transfer_config
|
||||
kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config
|
||||
assert kv_transfer_config.kv_connector == "OffloadingConnector"
|
||||
assert kv_transfer_config.kv_role == "kv_both"
|
||||
assert kv_connector_extra_config["cpu_bytes_to_use"] == 4.0 * (1 << 30)
|
||||
|
||||
221
tools/vllm-rocm/pin_rocm_dependencies.py
Normal file
221
tools/vllm-rocm/pin_rocm_dependencies.py
Normal file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Pin vLLM dependencies to exact versions of custom ROCm wheels.
|
||||
|
||||
This script modifies vLLM's requirements files to replace version constraints
|
||||
with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).
|
||||
|
||||
This ensures that 'pip install vllm' automatically installs the correct custom wheels
|
||||
instead of allowing pip to download different versions from PyPI.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_version_from_wheel(wheel_name: str) -> str:
    """
    Return the version component of a wheel filename.

    Per PEP 427 a wheel is named
    ``{distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl``,
    so the version is always the second dash-separated field.

    Example:
        torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
        triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0

    Raises:
        ValueError: if the name does not contain the five mandatory
            dash-separated fields (the build tag is optional).
    """
    fields = wheel_name.replace(".whl", "").split("-")

    # A well-formed wheel name has at least 5 fields.
    if len(fields) < 5:
        raise ValueError(f"Invalid wheel filename format: {wheel_name}")

    return fields[1]
|
||||
|
||||
|
||||
def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
    """
    Scan ``install_dir`` for custom-built wheels and extract their versions.

    Exits the process with status 1 if the directory does not exist.

    Returns:
        Dict mapping package names to exact versions, ordered as in the
        prefix table below — this order is preserved when the pins are
        later written into requirements files.
    """
    wheel_dir = Path(install_dir)
    if not wheel_dir.exists():
        print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
        sys.exit(1)

    # Filename prefix -> canonical package name.
    # The trailing dash stops a prefix from matching a longer name
    # (e.g. 'torch-' must not match 'torchvision-...').
    # ORDER MATTERS: it is the order the pins are emitted in.
    prefix_table = [
        ("torch-", "torch"),
        ("triton-", "triton"),
        ("triton_kernels-", "triton-kernels"),
        ("torchvision-", "torchvision"),
        ("torchaudio-", "torchaudio"),
        ("amdsmi-", "amdsmi"),
        ("flash_attn-", "flash-attn"),
        ("aiter-", "aiter"),
    ]

    found: dict[str, str] = {}

    for wheel_path in wheel_dir.glob("*.whl"):
        name = wheel_path.name
        for prefix, package in prefix_table:
            if not name.startswith(prefix):
                continue
            try:
                found[package] = extract_version_from_wheel(name)
                print(f"Found {package}=={found[package]}", file=sys.stderr)
            except Exception as exc:
                print(
                    f"WARNING: Could not extract version from {name}: {exc}",
                    file=sys.stderr,
                )
            # Only the first matching prefix applies to a given wheel.
            break

    # Re-emit in the canonical prefix-table order.
    return {package: found[package] for _, package in prefix_table if package in found}
|
||||
|
||||
|
||||
def pin_dependencies_in_requirements(requirements_path: str, versions: dict[str, str]):
    """
    Insert exact pins for the custom wheels at the TOP of a requirements file.

    This ensures that when setup.py processes the file line-by-line,
    custom wheels (torch, triton, etc.) are encountered FIRST, before
    any `-r common.txt` includes that might pull in other dependencies.
    Pre-existing requirement lines for the pinned packages are removed
    from the body of the file so stale constraints cannot conflict with
    the new pins.

    The original file is preserved next to the patched one with a
    ``.bak`` suffix.  Exits the process with status 1 if the
    requirements file does not exist.

    Resulting layout:
        # Custom ROCm wheel pins (auto-generated)
        torch==2.9.0a0+git1c57644
        triton==3.4.0

        -r common.txt
        ... rest of file ...

    Args:
        requirements_path: Path of the requirements file, patched in place.
        versions: Package name -> exact version, in emission order.
    """
    requirements_file = Path(requirements_path)

    if not requirements_file.exists():
        print(
            f"ERROR: Requirements file not found: {requirements_path}", file=sys.stderr
        )
        sys.exit(1)

    # Backup original file before rewriting it.
    backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
    with open(requirements_file) as f:
        original_lines = f.readlines()

    with open(backup_file, "w") as f:
        f.writelines(original_lines)

    # Build header with pinned custom wheels.
    header_lines = [
        "# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
        "# These must come FIRST to ensure correct dependency resolution\n",
    ]
    for package_name, exact_version in versions.items():
        header_lines.append(f"{package_name}=={exact_version}\n")
    header_lines.append("\n")  # Blank line separator

    # Filter out any existing entries for custom packages from original file.
    filtered_lines = []
    removed_packages = []

    for line in original_lines:
        stripped = line.strip()
        should_keep = True

        # Only requirement lines can shadow a pin; comments and option
        # lines (-r / -c / --index-url ...) are always kept.
        if stripped and not stripped.startswith("#") and not stripped.startswith("-"):
            for package_name in versions:
                # Handle both hyphen and underscore variations of the name.
                pattern_name = package_name.replace("-", "[-_]")
                # Match the bare name, optional extras ([...]) and any
                # trailing specifier / URL / marker / comment.  The
                # previous pattern required "<name><op><version>", which
                # silently kept bare "torch" or "torch[extra]==x" lines
                # that then conflicted with the new top-of-file pins.
                pattern = rf"^{pattern_name}(\[[^\]]*\])?\s*($|[=<>!~;@#].*)"

                if re.match(pattern, stripped, re.IGNORECASE):
                    removed_packages.append(f"{package_name}: {stripped}")
                    should_keep = False
                    break

        if should_keep:
            filtered_lines.append(line)

    # Combine: header + filtered original content, then write in place.
    with open(requirements_file, "w") as f:
        f.writelines(header_lines + filtered_lines)

    # Human-readable summary on stderr.
    print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
    for package_name, exact_version in versions.items():
        print(f"  - {package_name}=={exact_version}", file=sys.stderr)

    if removed_packages:
        print("\n✓ Removed old package entries:", file=sys.stderr)
        for pkg in removed_packages:
            print(f"  - {pkg}", file=sys.stderr)

    print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
    print(f"  Backup saved: {backup_file}", file=sys.stderr)
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: scan the wheel directory, then patch the
    requirements file with exact pins.

    Usage: pin_rocm_dependencies.py <install_dir> <requirements_file>
    """
    if len(sys.argv) != 3:
        print(
            f"Usage: {sys.argv[0]} <install_dir> <requirements_file>", file=sys.stderr
        )
        print(
            f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
            file=sys.stderr,
        )
        sys.exit(1)

    install_dir, requirements_path = sys.argv[1], sys.argv[2]

    banner = "=" * 70
    print(banner, file=sys.stderr)
    print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
    print(banner, file=sys.stderr)

    # Discover the custom wheel versions first; abort if none are present.
    print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
    versions = get_custom_wheel_versions(install_dir)

    if not versions:
        print("\nERROR: No custom wheels found in /install!", file=sys.stderr)
        sys.exit(1)

    # Rewrite the requirements file with the pins at the top.
    print(f"\nPatching {requirements_path}...", file=sys.stderr)
    pin_dependencies_in_requirements(requirements_path, versions)

    print("\n" + banner, file=sys.stderr)
    print("✓ Dependency pinning complete!", file=sys.stderr)
    print(banner, file=sys.stderr)

    sys.exit(0)


if __name__ == "__main__":
    main()
|
||||
@@ -152,13 +152,13 @@ class CacheConfig:
|
||||
kv_offloading_size: float | None = None
|
||||
"""Size of the KV cache offloading buffer in GiB. When TP > 1, this is
|
||||
the total buffer size summed across all TP ranks. By default, this is set
|
||||
to None, which means no KV offloading is enabled. When set with
|
||||
kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
|
||||
to None, which means no KV offloading is enabled. When set, vLLM will
|
||||
enable KV cache offloading to CPU using the kv_offloading_backend."""
|
||||
|
||||
kv_offloading_backend: KVOffloadingBackend | None = None
|
||||
kv_offloading_backend: KVOffloadingBackend = "native"
|
||||
"""The backend to use for KV cache offloading. Supported backends include
|
||||
'native' (vLLM native CPU offloading), 'lmcache' This option must be used
|
||||
together with kv_offloading_size."""
|
||||
'native' (vLLM native CPU offloading), 'lmcache'.
|
||||
KV offloading is only activated when kv_offloading_size is set."""
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
|
||||
@@ -48,7 +48,7 @@ class PoolerConfig:
|
||||
## for embeddings models
|
||||
normalize: bool | None = None
|
||||
"""
|
||||
Whether to normalize the embeddings outputs. Defaults to True.
|
||||
DEPRECATED: please use `use_activation` instead.
|
||||
"""
|
||||
dimensions: int | None = None
|
||||
"""
|
||||
@@ -75,11 +75,11 @@ class PoolerConfig:
|
||||
## for classification models
|
||||
softmax: float | None = None
|
||||
"""
|
||||
softmax will be deprecated, please use use_activation instead.
|
||||
DEPRECATED: please use `use_activation` instead.
|
||||
"""
|
||||
activation: float | None = None
|
||||
"""
|
||||
activation will be deprecated, please use use_activation instead.
|
||||
DEPRECATED: please use `use_activation` instead.
|
||||
"""
|
||||
use_activation: bool | None = None
|
||||
"""
|
||||
@@ -164,17 +164,24 @@ class PoolerConfig:
|
||||
|
||||
|
||||
def get_use_activation(o: object):
|
||||
if softmax := getattr(o, "softmax", None) is not None:
|
||||
if (normalize := getattr(o, "normalize", None)) is not None:
|
||||
logger.warning_once(
|
||||
"softmax will be deprecated and will be removed in v0.15. "
|
||||
"Please use use_activation instead."
|
||||
"`normalize` is deprecated and will be removed in v0.15. "
|
||||
"Please use `use_activation` instead."
|
||||
)
|
||||
return normalize
|
||||
|
||||
if (softmax := getattr(o, "softmax", None)) is not None:
|
||||
logger.warning_once(
|
||||
"`softmax` is deprecated and will be removed in v0.15. "
|
||||
"Please use `use_activation` instead."
|
||||
)
|
||||
return softmax
|
||||
|
||||
if activation := getattr(o, "activation", None) is not None:
|
||||
if (activation := getattr(o, "activation", None)) is not None:
|
||||
logger.warning_once(
|
||||
"activation will be deprecated and will be removed in v0.15. "
|
||||
"Please use use_activation instead."
|
||||
"`activation` is deprecated and will be removed in v0.15. "
|
||||
"Please use `use_activation` instead."
|
||||
)
|
||||
return activation
|
||||
|
||||
|
||||
@@ -498,17 +498,15 @@ class VllmConfig:
|
||||
Right now, this function reads the offloading settings from
|
||||
CacheConfig and configures the KVTransferConfig accordingly.
|
||||
"""
|
||||
if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None:
|
||||
# KV offloading is only activated when kv_offloading_size is set.
|
||||
if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
|
||||
return
|
||||
|
||||
kv_offloading_backend = self.cache_config.kv_offloading_backend
|
||||
|
||||
# If no KVTransferConfig is provided, create a default one.
|
||||
if self.kv_transfer_config is None:
|
||||
self.kv_transfer_config = KVTransferConfig()
|
||||
|
||||
if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
|
||||
raise ValueError(
|
||||
"You must set kv_offloading_size when kv_offloading_backend is set."
|
||||
)
|
||||
num_kv_ranks = (
|
||||
self.parallel_config.tensor_parallel_size
|
||||
* self.parallel_config.pipeline_parallel_size
|
||||
|
||||
@@ -234,7 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
|
||||
lora_id=e.lora_id,
|
||||
block_size=e.block_size,
|
||||
medium=e.medium,
|
||||
lora_name=e.lora_name,
|
||||
lora_name=getattr(e, "lora_name", None),
|
||||
)
|
||||
for e in events
|
||||
]
|
||||
|
||||
@@ -578,9 +578,7 @@ class EngineArgs:
|
||||
optimization_level: OptimizationLevel = VllmConfig.optimization_level
|
||||
|
||||
kv_offloading_size: float | None = CacheConfig.kv_offloading_size
|
||||
kv_offloading_backend: KVOffloadingBackend | None = (
|
||||
CacheConfig.kv_offloading_backend
|
||||
)
|
||||
kv_offloading_backend: KVOffloadingBackend = CacheConfig.kv_offloading_backend
|
||||
tokens_only: bool = False
|
||||
|
||||
def __post_init__(self):
|
||||
|
||||
@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
|
||||
return PoolingParams(
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
dimensions=self.dimensions,
|
||||
normalize=self.normalize,
|
||||
use_activation=self.normalize,
|
||||
)
|
||||
|
||||
|
||||
@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
|
||||
return PoolingParams(
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
dimensions=self.dimensions,
|
||||
normalize=self.normalize,
|
||||
use_activation=self.normalize,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
|
||||
return PoolingParams(
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
dimensions=self.dimensions,
|
||||
normalize=self.normalize,
|
||||
use_activation=get_use_activation(self),
|
||||
)
|
||||
|
||||
@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
|
||||
return PoolingParams(
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
dimensions=self.dimensions,
|
||||
normalize=self.normalize,
|
||||
use_activation=get_use_activation(self),
|
||||
)
|
||||
|
||||
|
||||
@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead):
|
||||
|
||||
# for normalize
|
||||
if self.activation is not None:
|
||||
flags = [p.normalize for p in pooling_params]
|
||||
flags = [p.use_activation for p in pooling_params]
|
||||
if len(set(flags)) == 1:
|
||||
if flags[0]:
|
||||
pooled_data = self.activation(pooled_data)
|
||||
|
||||
@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig):
|
||||
vllm_config = get_current_vllm_config()
|
||||
model_config = vllm_config.model_config
|
||||
head = EmbeddingPoolerHead(
|
||||
projector=_load_st_projector(model_config),
|
||||
head_dtype=model_config.head_dtype,
|
||||
projector=_load_st_projector(model_config),
|
||||
activation=PoolerNormalize(),
|
||||
)
|
||||
|
||||
@@ -116,9 +116,9 @@ def pooler_for_classify(
|
||||
vllm_config = get_current_vllm_config()
|
||||
model_config = vllm_config.model_config
|
||||
head = ClassifierPoolerHead(
|
||||
head_dtype=model_config.head_dtype,
|
||||
classifier=classifier,
|
||||
logit_bias=model_config.pooler_config.logit_bias,
|
||||
head_dtype=model_config.head_dtype,
|
||||
activation=resolve_classifier_act_fn(
|
||||
model_config, static_num_labels=True, act_fn=act_fn
|
||||
),
|
||||
|
||||
@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC):
|
||||
class TokenEmbeddingPoolerHead(TokenPoolerHead):
|
||||
def __init__(
|
||||
self,
|
||||
projector: ProjectorFn | None = None,
|
||||
head_dtype: torch.dtype | str | None = None,
|
||||
projector: ProjectorFn | None = None,
|
||||
activation: ActivationFn | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.projector = projector
|
||||
self.head_dtype = head_dtype
|
||||
self.projector = projector
|
||||
self.activation = activation
|
||||
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
|
||||
pooled_data = pooled_data[..., : pooling_param.dimensions]
|
||||
|
||||
# for normalize
|
||||
if self.activation is not None and pooling_param.normalize:
|
||||
if self.activation is not None and pooling_param.use_activation:
|
||||
pooled_data = self.activation(pooled_data)
|
||||
|
||||
# pooled_data shape: [n_tokens, embedding_dimension]
|
||||
|
||||
@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig):
|
||||
vllm_config = get_current_vllm_config()
|
||||
model_config = vllm_config.model_config
|
||||
head = TokenEmbeddingPoolerHead(
|
||||
projector=_load_st_projector(model_config),
|
||||
head_dtype=model_config.head_dtype,
|
||||
projector=_load_st_projector(model_config),
|
||||
activation=PoolerNormalize(),
|
||||
)
|
||||
|
||||
@@ -116,9 +116,9 @@ def pooler_for_token_classify(
|
||||
vllm_config = get_current_vllm_config()
|
||||
model_config = vllm_config.model_config
|
||||
head = TokenClassifierPoolerHead(
|
||||
head_dtype=model_config.head_dtype,
|
||||
classifier=classifier,
|
||||
logit_bias=model_config.pooler_config.logit_bias,
|
||||
head_dtype=model_config.head_dtype,
|
||||
activation=resolve_classifier_act_fn(
|
||||
model_config, static_num_labels=False, act_fn=act_fn
|
||||
),
|
||||
|
||||
@@ -98,7 +98,9 @@ class QuantFP8(CustomOp):
|
||||
num_token_padding=self.num_token_padding,
|
||||
scale_ub=scale_ub,
|
||||
use_per_token_if_dynamic=self.use_per_token_if_dynamic,
|
||||
group_shape=self.group_shape if self.static else None,
|
||||
group_shape=(self.group_shape.row, self.group_shape.col)
|
||||
if self.static
|
||||
else None,
|
||||
)
|
||||
|
||||
def forward_hip(
|
||||
|
||||
@@ -116,8 +116,8 @@ class BertPooler(SequencePooler):
|
||||
|
||||
# Use lambdas so that weights are not registered under `self.head`
|
||||
self.head = EmbeddingPoolerHead(
|
||||
projector=lambda x: self.dense(x),
|
||||
head_dtype=head_dtype,
|
||||
projector=lambda x: self.dense(x),
|
||||
activation=LambdaPoolerActivation(self.act_fn),
|
||||
)
|
||||
|
||||
|
||||
@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler):
|
||||
config.hidden_size,
|
||||
eps=config.norm_eps,
|
||||
bias=config.norm_bias,
|
||||
dtype=head_dtype,
|
||||
)
|
||||
|
||||
# Use lambdas so that weights are not registered under `self.head`
|
||||
self.head = EmbeddingPoolerHead(
|
||||
projector=lambda x: self.dense(x),
|
||||
head_dtype=head_dtype,
|
||||
projector=lambda x: self.dense(x),
|
||||
activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))),
|
||||
)
|
||||
|
||||
|
||||
@@ -26,9 +26,9 @@ class PoolingParams(
|
||||
Set to None to disable truncation.
|
||||
dimensions: Reduce the dimensions of embeddings
|
||||
if model support matryoshka representation.
|
||||
normalize: Whether to normalize the embeddings outputs.
|
||||
softmax: softmax will be deprecated, please use use_activation instead.
|
||||
activation: activation will be deprecated, please use use_activation instead.
|
||||
normalize: Deprecated, please use use_activation instead.
|
||||
softmax: Deprecated, please use use_activation instead.
|
||||
activation: Deprecated, please use use_activation instead.
|
||||
use_activation: Whether to apply activation function to
|
||||
the classification outputs.
|
||||
"""
|
||||
@@ -63,15 +63,15 @@ class PoolingParams(
|
||||
|
||||
@property
|
||||
def all_parameters(self) -> list[str]:
|
||||
return ["dimensions", "normalize", "use_activation"]
|
||||
return ["dimensions", "use_activation"]
|
||||
|
||||
@property
|
||||
def valid_parameters(self):
|
||||
return {
|
||||
"embed": ["dimensions", "normalize"],
|
||||
"embed": ["dimensions", "use_activation"],
|
||||
"classify": ["use_activation"],
|
||||
"score": ["use_activation"],
|
||||
"token_embed": ["dimensions", "normalize"],
|
||||
"token_embed": ["dimensions", "use_activation"],
|
||||
"token_classify": ["use_activation"],
|
||||
}
|
||||
|
||||
@@ -162,8 +162,8 @@ class PoolingParams(
|
||||
|
||||
def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
|
||||
if self.task in ["embed", "token_embed"]:
|
||||
if self.normalize is None:
|
||||
self.normalize = True
|
||||
if self.use_activation is None:
|
||||
self.use_activation = True
|
||||
|
||||
if self.dimensions is not None and model_config is not None:
|
||||
if not model_config.is_matryoshka:
|
||||
@@ -213,7 +213,6 @@ class PoolingParams(
|
||||
return (
|
||||
f"PoolingParams("
|
||||
f"task={self.task}, "
|
||||
f"normalize={self.normalize}, "
|
||||
f"dimensions={self.dimensions}, "
|
||||
f"use_activation={self.use_activation}, "
|
||||
f"step_tag_id={self.step_tag_id}, "
|
||||
|
||||
@@ -801,7 +801,7 @@ def get_pooling_config(
|
||||
|
||||
logger.info("Found pooling configuration.")
|
||||
|
||||
config: dict[str, Any] = {"normalize": normalize}
|
||||
config: dict[str, Any] = {"use_activation": normalize}
|
||||
for key, val in pooling_dict.items():
|
||||
if val is True:
|
||||
pooling_type = parse_pooling_type(key)
|
||||
|
||||
@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
|
||||
# ROCM paged attention kernel only supports block sizes 16 and 32
|
||||
# due to shared memory (LDS) constraints on AMD GPUs.
|
||||
# See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
|
||||
return [16, 32]
|
||||
|
||||
# However, The limitations in [16, 32] are reasonable for a native C++ kernel,
|
||||
# but vLLM should allow support for non-standard sizes via the Triton path,
|
||||
# as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
|
||||
# where the Triton kernel under rocm_atten does not support inference
|
||||
# for a non-standard qwen3-next model with a block_size of 544.
|
||||
# We have fixed the Triton kernel so that the standard model uses the original
|
||||
# bit-addressing logic, while the non-standard model
|
||||
# uses our optimized kernel logic.
|
||||
return [16, 32, 544]
|
||||
|
||||
@classmethod
|
||||
def get_supported_head_sizes(cls) -> list[int]:
|
||||
|
||||
@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module):
|
||||
k: torch.Tensor | None,
|
||||
p: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
# FIXME: Fix aiter_sampler's accuracy issue and remove this flag
|
||||
DISABLE_AITER_SAMPLER = True
|
||||
"""Optimized ROCm/aiter path (same structure as forward_cuda)."""
|
||||
if (k is None and p is None) or generators:
|
||||
if generators:
|
||||
@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module):
|
||||
"processed_logits",
|
||||
"processed_logprobs",
|
||||
), "aiter sampler does not support returning logits/logprobs."
|
||||
if DISABLE_AITER_SAMPLER:
|
||||
return self.forward_native(logits, generators, k, p)
|
||||
return self.aiter_sample(logits, k, p, generators), None
|
||||
|
||||
def aiter_sample(
|
||||
|
||||
Reference in New Issue
Block a user