Compare commits
8 Commits: v0.17.2rc0 ... v0.14.0rc2

| Author | SHA1 | Date |
|---|---|---|
|  | 7f42dc20bb |  |
|  | c2a37a3cf8 |  |
|  | 0e31fc7996 |  |
|  | 6ac0fcf416 |  |
|  | b62249725c |  |
|  | 1b57275207 |  |
|  | 2c24bc6996 |  |
|  | 0aa8c40552 |  |
@@ -169,6 +169,24 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+- block: "Build ROCm release image"
+  key: block-rocm-release-image-build
+  depends_on: ~
+
+- label: "Build release image (ROCm)"
+  depends_on: block-rocm-release-image-build
+  id: build-release-image-rocm
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+    # Build base image first
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
+    # Build vLLM ROCm image using the base
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
+    - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
+
 - label: "Build and publish nightly multi-arch image to DockerHub"
   depends_on:
     - create-multi-arch-manifest
@@ -196,3 +214,365 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
       DOCKERHUB_USERNAME: "vllmbot"
+
+# =============================================================================
+# ROCm Release Pipeline (x86_64 only)
+# =============================================================================
+#
+# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
+# To build a specific version, trigger the build from that branch/tag.
+#
+# Environment variables for ROCm builds (set via Buildkite UI or schedule):
+#   ROCM_PYTHON_VERSION: Python version (default: 3.12)
+#   PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
+#   ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
+#   ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
+#
+# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
+#   (currently rocm/dev-ubuntu-22.04:7.1-complete)
+#
+# =============================================================================
+
+# ROCm Input Step - Collect build configuration (manual trigger only)
+- input: "ROCm Wheel Release Build Configuration"
+  key: input-rocm-config
+  depends_on: ~
+  if: build.source == "ui"
+  fields:
+    - text: "Python Version"
+      key: "rocm-python-version"
+      default: "3.12"
+      hint: "Python version (e.g., 3.12)"
+    - text: "GPU Architectures"
+      key: "rocm-pytorch-rocm-arch"
+      default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
+      hint: "Semicolon-separated GPU architectures"
+    - select: "Upload Wheels to S3"
+      key: "rocm-upload-wheels"
+      default: "true"
+      options:
+        - label: "No - Build only (nightly/dev)"
+          value: "false"
+        - label: "Yes - Upload to S3 (release)"
+          value: "true"
+    - select: "Force Rebuild Base Wheels"
+      key: "rocm-force-rebuild"
+      default: "false"
+      hint: "Ignore S3 cache and rebuild base wheels from scratch"
+      options:
+        - label: "No - Use cached wheels if available"
+          value: "false"
+        - label: "Yes - Rebuild even if cache exists"
+          value: "true"
+
+# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
+- label: ":rocm: Build ROCm Base Wheels"
+  id: build-rocm-base-wheels
+  depends_on:
+    - step: input-rocm-config
+      allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    # Set configuration and check cache
+    - |
+      set -euo pipefail
+
+      # Get values from meta-data (set by input step) or use defaults
+      PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
+      export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
+
+      PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+      export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+      # Check for force rebuild flag
+      ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
+      if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
+        ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
+      fi
+
+      echo "========================================"
+      echo "ROCm Base Wheels Build Configuration"
+      echo "========================================"
+      echo "  PYTHON_VERSION: $${PYTHON_VERSION}"
+      echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+      echo "  ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
+      echo "========================================"
+
+      # Save resolved config for later jobs
+      buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
+      buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
+
+      # Check S3 cache for pre-built wheels
+      CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
+      CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
+      echo ""
+      echo "Cache key: $${CACHE_KEY}"
+      echo "Cache path: $${CACHE_PATH}"
+
+      # Save cache key for downstream jobs
+      buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
+
+      CACHE_STATUS="miss"
+      if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
+        CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
+      else
+        echo "Force rebuild requested, skipping cache check"
+      fi
+
+      if [ "$${CACHE_STATUS}" = "hit" ]; then
+        echo ""
+        echo "CACHE HIT! Downloading pre-built wheels..."
+        echo ""
+        .buildkite/scripts/cache-rocm-base-wheels.sh download
+
+        # Set the S3 path for the cached Docker image (for Job 2 to download)
+        S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+        buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+        # Mark that we used cache (for Docker image handling)
+        buildkite-agent meta-data set "rocm-used-cache" "true"
+
+        echo ""
+        echo "Cache download complete. Skipping Docker build."
+        echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+      else
+        echo ""
+        echo "CACHE MISS. Building from scratch..."
+        echo ""
+
+        # Build full base image (for later vLLM build)
+        DOCKER_BUILDKIT=1 docker buildx build \
+          --file docker/Dockerfile.rocm_base \
+          --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
+          --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+          --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+          --build-arg USE_SCCACHE=1 \
+          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+          --build-arg SCCACHE_REGION_NAME=us-west-2 \
+          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+          --load \
+          .
+
+        # Build debs_wheel_release stage for wheel extraction
+        DOCKER_BUILDKIT=1 docker buildx build \
+          --file docker/Dockerfile.rocm_base \
+          --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
+          --target debs_wheel_release \
+          --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+          --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
+          --build-arg USE_SCCACHE=1 \
+          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+          --build-arg SCCACHE_REGION_NAME=us-west-2 \
+          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+          --load \
+          .
+
+        # Extract wheels from Docker image
+        mkdir -p artifacts/rocm-base-wheels
+        container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
+        docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
+        docker rm $${container_id}
+        echo "Extracted base wheels:"
+        ls -lh artifacts/rocm-base-wheels/
+
+        # Upload wheels to S3 cache for future builds
+        echo ""
+        echo "Uploading wheels to S3 cache..."
+        .buildkite/scripts/cache-rocm-base-wheels.sh upload
+
+        # Export base Docker image for reuse in vLLM build
+        mkdir -p artifacts/rocm-docker-image
+        docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
+        echo "Docker image size:"
+        ls -lh artifacts/rocm-docker-image/
+
+        # Upload large Docker image to S3 (also cached by cache key)
+        S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
+        echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
+        aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+        # Save the S3 path for downstream jobs
+        buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
+
+        # Mark that we did NOT use cache
+        buildkite-agent meta-data set "rocm-used-cache" "false"
+
+        echo ""
+        echo "Build complete. Wheels cached for future builds."
+      fi
+  artifact_paths:
+    - "artifacts/rocm-base-wheels/*.whl"
+  env:
+    DOCKER_BUILDKIT: "1"
+    S3_BUCKET: "vllm-wheels"
+
+# ROCm Job 2: Build vLLM ROCm Wheel
+- label: ":python: Build vLLM ROCm Wheel"
+  id: build-rocm-vllm-wheel
+  depends_on:
+    - step: build-rocm-base-wheels
+      allow_failure: false
+  agents:
+    queue: cpu_queue_postmerge
+  timeout_in_minutes: 180
+  commands:
+    # Download artifacts and prepare Docker image
+    - |
+      set -euo pipefail
+
+      # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
+      # This fixes version detection when tags are moved/force-pushed
+      echo "Fetching latest tags from origin..."
+      git fetch --tags --force origin
+
+      # Log tag information for debugging version detection
+      echo "========================================"
+      echo "Git Tag Verification"
+      echo "========================================"
+      echo "Current HEAD: $(git rev-parse HEAD)"
+      echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
+      echo ""
+      echo "Recent tags (pointing to commits near HEAD):"
+      git tag -l --sort=-creatordate | head -5
+      echo "setuptools_scm version detection:"
+      pip install -q setuptools_scm 2>/dev/null || true
+      python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
+      echo "========================================"
+
+      # Download wheel artifacts from current build
+      echo "Downloading wheel artifacts from current build"
+      buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+
+      # Download Docker image from S3 (too large for Buildkite artifacts)
+      DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
+      if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
+        echo "ERROR: rocm-docker-image-s3-path metadata not found"
+        echo "This should have been set by the build-rocm-base-wheels job"
+        exit 1
+      fi
+      echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
+      mkdir -p artifacts/rocm-docker-image
+      aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
+
+      # Load base Docker image and capture the tag
+      echo "Loading base Docker image..."
+      LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
+      echo "$${LOAD_OUTPUT}"
+      # Extract the actual loaded image tag from "Loaded image: <tag>" output
+      # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
+      BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
+      if [ -z "$${BASE_IMAGE_TAG}" ]; then
+        echo "ERROR: Failed to extract image tag from docker load output"
+        echo "Load output was: $${LOAD_OUTPUT}"
+        exit 1
+      fi
+      echo "Loaded base image: $${BASE_IMAGE_TAG}"
+
+      # Prepare base wheels for Docker build context
+      mkdir -p docker/context/base-wheels
+      touch docker/context/base-wheels/.keep
+      cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
+      echo "Base wheels for vLLM build:"
+      ls -lh docker/context/base-wheels/
+
+      # Get GPU architectures from meta-data
+      PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+      PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+      echo "========================================"
+      echo "Building vLLM wheel with:"
+      echo "  BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
+      echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+      echo "  PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
+      echo "  BASE_IMAGE: $${BASE_IMAGE_TAG}"
+      echo "========================================"
+
+      # Build vLLM wheel using local checkout (REMOTE_VLLM=0)
+      DOCKER_BUILDKIT=1 docker build \
+        --file docker/Dockerfile.rocm \
+        --target export_vllm_wheel_release \
+        --output type=local,dest=rocm-dist \
+        --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
+        --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+        --build-arg REMOTE_VLLM=0 \
+        --build-arg GIT_REPO_CHECK=1 \
+        --build-arg USE_SCCACHE=1 \
+        --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+        --build-arg SCCACHE_REGION_NAME=us-west-2 \
+        --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+        .
+
+      echo "Built vLLM wheel:"
+      ls -lh rocm-dist/*.whl
+
+      # Copy wheel to artifacts directory
+      mkdir -p artifacts/rocm-vllm-wheel
+      cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
+      echo "Final vLLM wheel:"
+      ls -lh artifacts/rocm-vllm-wheel/
+  artifact_paths:
+    - "artifacts/rocm-vllm-wheel/*.whl"
+  env:
+    DOCKER_BUILDKIT: "1"
+    S3_BUCKET: "vllm-wheels"
+
+# ROCm Job 3: Upload Wheels to S3
+- label: ":s3: Upload ROCm Wheels to S3"
+  id: upload-rocm-wheels
+  depends_on:
+    - step: build-rocm-vllm-wheel
+      allow_failure: false
+  agents:
+    queue: cpu_queue_postmerge
+  timeout_in_minutes: 60
+  commands:
+    # Download all wheel artifacts and run upload
+    - |
+      set -euo pipefail
+
+      # Check if upload is enabled (from env var, meta-data, or release branch)
+      ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
+      if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
+        # Try to get from meta-data (input form)
+        ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
+      fi
+
+      echo "========================================"
+      echo "Upload check:"
+      echo "  ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
+      echo "  BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
+      echo "========================================"
+
+      # Skip upload if not enabled
+      if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
+        echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
+        echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
+        exit 0
+      fi
+
+      echo "Upload enabled, proceeding..."
+
+      # Download artifacts from current build
+      echo "Downloading artifacts from current build"
+      buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
+      buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
+
+      # Run upload script
+      bash .buildkite/scripts/upload-rocm-wheels.sh
+  env:
+    DOCKER_BUILDKIT: "1"
+    S3_BUCKET: "vllm-wheels"
+
+# ROCm Job 4: Annotate ROCm Wheel Release
+- label: ":memo: Annotate ROCm wheel release"
+  id: annotate-rocm-release
+  depends_on:
+    - step: upload-rocm-wheels
+      allow_failure: true
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    - "bash .buildkite/scripts/annotate-rocm-release.sh"
+  env:
+    S3_BUCKET: "vllm-wheels"
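One detail worth noting in Job 2: the base image tag is recovered by grepping the `docker load` output rather than assuming a fixed tag, precisely to avoid stale images on the agent. A rough Python equivalent of that grep/sed pair, with a hypothetical output line:

```python
load_output = "Loaded image: rocm/vllm-dev:base-1234\n"  # hypothetical `docker load` output

# Keep only "Loaded image: <tag>" lines and strip the prefix, like grep | sed above
tags = [line.removeprefix("Loaded image: ")
        for line in load_output.splitlines()
        if line.startswith("Loaded image:")]
base_image_tag = tags[0] if tags else None
assert base_image_tag == "rocm/vllm-dev:base-1234"
```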
@@ -32,6 +32,7 @@ To download and upload the image:
 \`\`\`
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
 
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64

@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
+docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai:latest-rocm
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
74  .buildkite/scripts/annotate-rocm-release.sh (Executable file)
@@ -0,0 +1,74 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Generate Buildkite annotation for ROCm wheel release

set -ex

# Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"

buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## :rocm: ROCm Wheel Release

### Build Configuration
| Setting | Value |
|---------|-------|
| **ROCm Version** | ${ROCM_VERSION} |
| **Python Version** | ${PYTHON_VERSION} |
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
| **Branch** | \`${BUILDKITE_BRANCH}\` |
| **Commit** | \`${BUILDKITE_COMMIT}\` |

### :package: Installation

**Install from this build (by commit):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/

# Example:
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\`

**Install from nightly (if published):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
\`\`\`

### :floppy_disk: Download Wheels Directly

\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/

# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
\`\`\`

### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton_rocm**: Triton built for ROCm
- **torchvision**: TorchVision for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings

### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
EOF
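The `sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/'` extraction above can be sanity-checked with an equivalent Python regex (the BASE_IMAGE line mirrors the comment in the script):

```python
import re

line = "ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete"
# Greedy .*: skips to the last colon, then captures major.minor, like the sed call
rocm_version = re.sub(r".*:([0-9]+\.[0-9]+).*", r"\1", line)
assert rocm_version == "7.1"
```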
140  .buildkite/scripts/cache-rocm-base-wheels.sh (Executable file)
@@ -0,0 +1,140 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Cache helper for ROCm base wheels
#
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
#
# Usage:
#   cache-rocm-base-wheels.sh check    - Check if cache exists, outputs "hit" or "miss"
#   cache-rocm-base-wheels.sh upload   - Upload wheels to cache
#   cache-rocm-base-wheels.sh download - Download wheels from cache
#   cache-rocm-base-wheels.sh key      - Output the cache key
#
# Environment variables:
#   S3_BUCKET          - S3 bucket name (default: vllm-wheels)
#   PYTHON_VERSION     - Python version (affects cache key)
#   PYTORCH_ROCM_ARCH  - GPU architectures (affects cache key)
#
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
# so changes to ROCm version are captured by the Dockerfile hash.

set -euo pipefail

BUCKET="${S3_BUCKET:-vllm-wheels}"
DOCKERFILE="docker/Dockerfile.rocm_base"
CACHE_PREFIX="rocm/cache"

# Generate hash from Dockerfile content + build args
generate_cache_key() {
  # Include Dockerfile content
  if [[ ! -f "$DOCKERFILE" ]]; then
    echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
    exit 1
  fi
  local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)

  # Include key build args that affect the output
  # These should match the ARGs in Dockerfile.rocm_base that change the build output
  # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
  local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
  local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)

  echo "${dockerfile_hash}-${args_hash}"
}

CACHE_KEY=$(generate_cache_key)
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"

case "${1:-}" in
  check)
    echo "Checking cache for key: ${CACHE_KEY}" >&2
    echo "Cache path: ${CACHE_PATH}" >&2
    echo "Variables used in cache key:" >&2
    echo "  PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
    echo "  PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2

    # Check if cache exists by listing objects
    # We look for at least one .whl file
    echo "Running: aws s3 ls ${CACHE_PATH}" >&2
    S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
    echo "S3 ls output:" >&2
    echo "$S3_OUTPUT" | head -5 >&2

    if echo "$S3_OUTPUT" | grep -q "\.whl"; then
      echo "hit"
    else
      echo "miss"
    fi
    ;;

  upload)
    echo "========================================"
    echo "Uploading wheels to cache"
    echo "========================================"
    echo "Cache key: ${CACHE_KEY}"
    echo "Cache path: ${CACHE_PATH}"
    echo ""

    if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
      echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
      exit 1
    fi

    WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
    if [[ "$WHEEL_COUNT" -eq 0 ]]; then
      echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
      exit 1
    fi

    echo "Uploading $WHEEL_COUNT wheels..."
    aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"

    echo ""
    echo "Cache upload complete!"
    echo "========================================"
    ;;

  download)
    echo "========================================"
    echo "Downloading wheels from cache"
    echo "========================================"
    echo "Cache key: ${CACHE_KEY}"
    echo "Cache path: ${CACHE_PATH}"
    echo ""

    mkdir -p artifacts/rocm-base-wheels
    aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/

    echo ""
    echo "Downloaded wheels:"
    ls -lh artifacts/rocm-base-wheels/

    WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
    echo ""
    echo "Total: $WHEEL_COUNT wheels"
    echo "========================================"
    ;;

  key)
    echo "${CACHE_KEY}"
    ;;

  path)
    echo "${CACHE_PATH}"
    ;;

  *)
    echo "Usage: $0 {check|upload|download|key|path}" >&2
    echo "" >&2
    echo "Commands:" >&2
    echo "  check    - Check if cache exists, outputs 'hit' or 'miss'" >&2
    echo "  upload   - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
    echo "  download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
    echo "  key      - Output the cache key" >&2
    echo "  path     - Output the full S3 cache path" >&2
    exit 1
    ;;
esac
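Because the cache key is pure content hashing, it can be reproduced outside the script. A minimal Python sketch of the same derivation (note that `echo ... | sha256sum` hashes the string plus a trailing newline; the sample inputs are hypothetical):

```python
import hashlib

def cache_key(dockerfile_bytes: bytes, python_version: str, rocm_arch: str) -> str:
    # sha256sum "$DOCKERFILE" | cut -c1-16
    dockerfile_hash = hashlib.sha256(dockerfile_bytes).hexdigest()[:16]
    # echo "$args_string" | sha256sum | cut -c1-8 (echo appends "\n")
    args_string = f"{python_version}|{rocm_arch}"
    args_hash = hashlib.sha256((args_string + "\n").encode()).hexdigest()[:8]
    return f"{dockerfile_hash}-{args_hash}"

print(cache_key(b"FROM rocm/dev-ubuntu-22.04:7.1-complete\n", "3.12", "gfx942"))
```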
@@ -16,6 +16,18 @@ from urllib.parse import quote
 
 import regex as re
 
+
+def normalize_package_name(name: str) -> str:
+    """
+    Normalize package name according to PEP 503.
+    https://peps.python.org/pep-0503/#normalized-names
+
+    Replace runs of underscores, hyphens, and periods with a single hyphen,
+    and lowercase the result.
+    """
+    return re.sub(r"[-_.]+", "-", name).lower()
+
+
 if not sys.version_info >= (3, 12):
     raise RuntimeError("This script requires Python 3.12 or higher.")
 
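For reference, a quick self-contained check of the PEP 503 rule that `normalize_package_name` implements (the example names are hypothetical):

```python
import re

def normalize_package_name(name: str) -> str:
    # Collapse runs of -, _, . into a single hyphen, then lowercase (PEP 503)
    return re.sub(r"[-_.]+", "-", name).lower()

assert normalize_package_name("triton_rocm") == "triton-rocm"
assert normalize_package_name("Foo.Bar--baz") == "foo-bar-baz"
```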
@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
         version = version.removesuffix("." + variant)
     else:
         if "+" in version:
-            version, variant = version.split("+")
+            version_part, suffix = version.split("+", 1)
+            # Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
+            # Git hashes and other suffixes are NOT variants
+            if suffix.startswith(("rocm", "cu", "cpu")):
+                variant = suffix
+                version = version_part
+            # Otherwise keep the full version string (variant stays None)
 
     return WheelFileInfo(
         package_name=package_name,
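To make the behavior change concrete, here is a small sketch of the new suffix classification (the version strings are hypothetical):

```python
def split_variant(version: str) -> tuple[str, str | None]:
    # Mirrors the hunk above: only rocm/cu/cpu local-version tags become variants
    if "+" in version:
        version_part, suffix = version.split("+", 1)
        if suffix.startswith(("rocm", "cu", "cpu")):
            return version_part, suffix
    return version, None

assert split_variant("2.7.0+rocm700") == ("2.7.0", "rocm700")
assert split_variant("0.17.2+cu129") == ("0.17.2", "cu129")
assert split_variant("1.2.3+gabc1234") == ("1.2.3+gabc1234", None)  # git hash is kept
```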
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
         print("No wheel files found, skipping index generation.")
         return
 
+    # For ROCm builds: inherit variant from vllm wheel
+    # All ROCm wheels should share the same variant as vllm
+    rocm_variant = None
+    for file in parsed_files:
+        if (
+            file.package_name == "vllm"
+            and file.variant
+            and file.variant.startswith("rocm")
+        ):
+            rocm_variant = file.variant
+            print(f"Detected ROCm variant from vllm: {rocm_variant}")
+            break
+
+    # Apply ROCm variant to all wheels without a variant
+    if rocm_variant:
+        for file in parsed_files:
+            if file.variant is None:
+                file.variant = rocm_variant
+                print(f"Inherited variant '{rocm_variant}' for {file.filename}")
+
     # Group by variant
     variant_to_files: dict[str, list[WheelFileInfo]] = {}
     for file in parsed_files:
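A minimal illustration of the inheritance rule, using a hypothetical stand-in for `WheelFileInfo`:

```python
from dataclasses import dataclass

@dataclass
class Wheel:  # stand-in for WheelFileInfo, for illustration only
    package_name: str
    variant: str | None

files = [Wheel("vllm", "rocm700"), Wheel("torch", None), Wheel("amdsmi", None)]
rocm_variant = next(
    (f.variant for f in files
     if f.package_name == "vllm" and f.variant and f.variant.startswith("rocm")),
    None,
)
for f in files:
    if rocm_variant and f.variant is None:
        f.variant = rocm_variant
# torch and amdsmi are now indexed under "rocm700" alongside vllm
```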
@@ -256,8 +294,8 @@
 
         variant_dir.mkdir(parents=True, exist_ok=True)
 
-        # gather all package names in this variant
-        packages = set(f.package_name for f in files)
+        # gather all package names in this variant (normalized per PEP 503)
+        packages = set(normalize_package_name(f.package_name) for f in files)
         if variant == "default":
             # these packages should also appear in the "project list"
             # generate after all variants are processed

@@ -269,8 +307,10 @@
             f.write(project_list_str)
 
         for package in packages:
-            # filter files belonging to this package only
-            package_files = [f for f in files if f.package_name == package]
+            # filter files belonging to this package only (compare normalized names)
+            package_files = [
+                f for f in files if normalize_package_name(f.package_name) == package
+            ]
             package_dir = variant_dir / package
             package_dir.mkdir(parents=True, exist_ok=True)
             index_str, metadata_str = generate_package_index_and_metadata(
@@ -341,8 +381,13 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     version = args.version
-    if "/" in version or "\\" in version:
-        raise ValueError("Version string must not contain slashes.")
+    # Allow rocm/ prefix, reject other slashes and all backslashes
+    if "\\" in version:
+        raise ValueError("Version string must not contain backslashes.")
+    if "/" in version and not version.startswith("rocm/"):
+        raise ValueError(
+            "Version string must not contain slashes (except for 'rocm/' prefix)."
+        )
     current_objects_path = Path(args.current_objects)
     output_dir = Path(args.output_dir)
     if not output_dir.exists():
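A quick sketch of what the relaxed check now accepts and rejects (the values are hypothetical):

```python
def version_is_valid(version: str) -> bool:
    # Mirrors the checks above: backslashes never allowed;
    # slashes only as the "rocm/" prefix.
    if "\\" in version:
        return False
    if "/" in version and not version.startswith("rocm/"):
        return False
    return True

assert version_is_valid("0.17.2")
assert version_is_valid("rocm/1b57275207")   # ROCm commit-scoped index
assert not version_is_valid("cuda/1b57275207")  # other prefixes still rejected
```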
@@ -393,8 +438,23 @@
     # Generate index and metadata, assuming wheels and indices are stored as:
     # s3://vllm-wheels/{wheel_dir}/<wheel files>
     # s3://vllm-wheels/<anything>/<index files>
-    wheel_dir = args.wheel_dir or version
-    wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
+    #
+    # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
+    #   - rocm/{commit}/ (same as wheels)
+    #   - rocm/nightly/
+    #   - rocm/{version}/
+    # All these are under the "rocm/" prefix, so relative paths should be
+    # relative to "rocm/", not the bucket root.
+    if args.wheel_dir:
+        # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
+        wheel_dir = args.wheel_dir.strip().rstrip("/")
+    elif version.startswith("rocm/"):
+        # For rocm/commit, wheel_base_dir should be just the commit part
+        # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
+        wheel_dir = version.split("/", 1)[1]
+    else:
+        wheel_dir = version
+    wheel_base_dir = Path(output_dir).parent / wheel_dir
     index_base_dir = Path(output_dir)
 
     generate_index_and_metadata(
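The `../../../{commit}/` claim in the comment can be verified directly; a small sketch with a hypothetical commit id:

```python
import os

# Index pages under rocm/0.12.0/rocm710/vllm/ must link to wheels stored in
# rocm/{commit}/, so wheel_base_dir is anchored at the commit part.
commit = "1b57275207"  # hypothetical
print(os.path.relpath(f"rocm/{commit}", "rocm/0.12.0/rocm710/vllm"))
# -> ../../../1b57275207
```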
151  .buildkite/scripts/upload-rocm-wheels.sh (Executable file)
@@ -0,0 +1,151 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Upload ROCm wheels to S3 with proper index generation
#
# Required environment variables:
#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
#   S3_BUCKET (default: vllm-wheels)
#
# S3 path structure:
#   s3://vllm-wheels/rocm/{commit}/   - All wheels for this commit
#   s3://vllm-wheels/rocm/nightly/    - Index pointing to latest nightly
#   s3://vllm-wheels/rocm/{version}/  - Index for release versions

set -ex

# ======== Configuration ========
BUCKET="${S3_BUCKET:-vllm-wheels}"
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
INDICES_OUTPUT_DIR="rocm-indices"
PYTHON="${PYTHON_PROG:-python3}"

# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
MANYLINUX_VERSION="manylinux_2_35"

echo "========================================"
echo "ROCm Wheel Upload Configuration"
echo "========================================"
echo "S3 Bucket: $BUCKET"
echo "S3 Path: $ROCM_SUBPATH"
echo "Commit: $BUILDKITE_COMMIT"
echo "Branch: $BUILDKITE_BRANCH"
echo "========================================"

# ======== Part 0: Setup Python ========

# Detect if python3.12+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
if [[ "$has_new_python" -eq 0 ]]; then
  # Use new python from docker
  # Use --user to ensure files are created with correct ownership (not root)
  docker pull python:3-slim
  PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
fi

echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"

# ======== Part 1: Collect and prepare wheels ========

# Collect all wheels
mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true

WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"

if [ "$WHEEL_COUNT" -eq 0 ]; then
  echo "ERROR: No wheels found to upload!"
  exit 1
fi

# Rename linux to manylinux in wheel filenames
for wheel in all-rocm-wheels/*.whl; do
  if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
    new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
    mv -- "$wheel" "$new_wheel"
    echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
  fi
done

echo ""
echo "Wheels to upload:"
ls -lh all-rocm-wheels/

# ======== Part 2: Upload wheels to S3 ========

echo ""
echo "Uploading wheels to $S3_COMMIT_PREFIX"
for wheel in all-rocm-wheels/*.whl; do
  aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
done

# ======== Part 3: Generate and upload indices ========

# List existing wheels in commit directory
echo ""
echo "Generating indices..."
obj_json="rocm-objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"

mkdir -p "$INDICES_OUTPUT_DIR"

# Use the existing generate-nightly-index.py
# HACK: Replace regex module with stdlib re (same as CUDA script)
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py

$PYTHON .buildkite/scripts/generate-nightly-index.py \
  --version "$ROCM_SUBPATH" \
  --current-objects "$obj_json" \
  --output-dir "$INDICES_OUTPUT_DIR" \
  --comment "ROCm commit $BUILDKITE_COMMIT"

# Upload indices to commit directory
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"

# Update rocm/nightly/ if on main branch and not a PR
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
  echo "Updating rocm/nightly/ index..."
  aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
fi

# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
  VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
  echo "Version in wheel: $VERSION"
  PURE_VERSION="${VERSION%%+*}"
  PURE_VERSION="${PURE_VERSION%%.rocm}"
  echo "Pure version: $PURE_VERSION"

  if [[ "$VERSION" != *"dev"* ]]; then
    echo "Updating rocm/$PURE_VERSION/ index..."
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
  fi
fi

# ======== Part 4: Summary ========

echo ""
echo "========================================"
echo "ROCm Wheel Upload Complete!"
echo "========================================"
echo ""
echo "Wheels available at:"
echo "  s3://$BUCKET/$ROCM_SUBPATH/"
echo ""
echo "Install command (by commit):"
echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
echo ""
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
  echo "Install command (nightly):"
  echo "  pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
fi
echo ""
echo "Wheel count: $WHEEL_COUNT"
echo "========================================"
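The `PURE_VERSION` derivation above is plain suffix stripping; a Python sketch with a hypothetical wheel version string:

```python
version = "0.17.2rc0.rocm+rocm700"  # hypothetical Version: field from the wheel METADATA

pure = version.split("+", 1)[0]    # bash: ${VERSION%%+*}      -> "0.17.2rc0.rocm"
pure = pure.removesuffix(".rocm")  # bash: ${PURE_VERSION%%.rocm} -> "0.17.2rc0"
print(pure)
```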
@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
 ARG BASE_IMAGE=rocm/vllm-dev:base
 
+# Sccache configuration (only used in release pipeline)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
+
 FROM ${BASE_IMAGE} AS base
 
 ARG ARG_PYTORCH_ROCM_ARCH
@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl
-# Remove sccache
 RUN python3 -m pip install --upgrade pip
-RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
+ARG USE_SCCACHE
+RUN if [ "$USE_SCCACHE" != "1" ]; then \
+        apt-get purge -y sccache || true; \
+        python3 -m pip uninstall -y sccache || true; \
+        rm -f "$(which sccache)" || true; \
+    fi
+
 # Install UV
 RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
 
+# Install sccache if USE_SCCACHE is enabled (for release builds)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME
+ARG SCCACHE_REGION_NAME
+ARG SCCACHE_S3_NO_CREDENTIALS
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        if command -v sccache >/dev/null 2>&1; then \
+            echo "sccache already installed, skipping installation"; \
+            sccache --version; \
+        else \
+            echo "Installing sccache..." \
+            && SCCACHE_ARCH="x86_64" \
+            && SCCACHE_VERSION="v0.8.1" \
+            && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
+            && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
+            && tar -xzf /tmp/sccache.tar.gz -C /tmp \
+            && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+            && chmod +x /usr/bin/sccache \
+            && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
+            && sccache --version; \
+        fi; \
+    fi
+
+# Set sccache environment variables only when USE_SCCACHE=1
+# This prevents S3 config from leaking into images when sccache is not used
+ARG USE_SCCACHE
+ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
+ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
+ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
+ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
+
 ARG COMMON_WORKDIR
 WORKDIR ${COMMON_WORKDIR}
 
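The `${USE_SCCACHE:+...}` pattern expands to the alternate value only when `USE_SCCACHE` is set and non-empty, so the ENV lines collapse to empty strings in non-release builds. A rough Python analogue of that expansion (variable values are hypothetical):

```python
def expand_plus(var: str | None, alternate: str) -> str:
    # Shell ${VAR:+alternate}: alternate if VAR is set and non-empty, else ""
    return alternate if var else ""

assert expand_plus("1", "vllm-build-sccache") == "vllm-build-sccache"
assert expand_plus("", "vllm-build-sccache") == ""
assert expand_plus(None, "vllm-build-sccache") == ""
```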
@@ -51,7 +97,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
-# Build vLLM
+# Build vLLM (setup.py auto-detects sccache in PATH)
 RUN cd vllm \
     && python3 -m pip install -r requirements/rocm.txt \
     && python3 setup.py clean --all \
@@ -67,6 +113,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
|
|||||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
||||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
|
||||||
|
|
||||||
|
# RIXL/UCX build stages
|
||||||
|
FROM base AS build_rixl
|
||||||
|
ARG RIXL_BRANCH="f33a5599"
|
||||||
|
ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
|
||||||
|
ARG UCX_BRANCH="da3fac2a"
|
||||||
|
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
|
||||||
|
ENV ROCM_PATH=/opt/rocm
|
||||||
|
ENV UCX_HOME=/usr/local/ucx
|
||||||
|
ENV RIXL_HOME=/usr/local/rixl
|
||||||
|
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
|
||||||
|
|
||||||
|
# RIXL build system dependences and RDMA support
|
||||||
|
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
|
||||||
|
libgrpc-dev \
|
||||||
|
libgrpc++-dev \
|
||||||
|
libprotobuf-dev \
|
||||||
|
protobuf-compiler-grpc \
|
||||||
|
libcpprest-dev \
|
||||||
|
libaio-dev \
|
||||||
|
librdmacm1 \
|
||||||
|
librdmacm-dev \
|
||||||
|
libibverbs1 \
|
||||||
|
libibverbs-dev \
|
||||||
|
ibverbs-utils \
|
||||||
|
rdmacm-utils \
|
||||||
|
ibverbs-providers \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
RUN uv pip install --system meson auditwheel patchelf tomlkit
|
||||||
|
|
||||||
|
RUN cd /usr/local/src && \
|
||||||
|
git clone ${UCX_REPO} && \
|
||||||
|
cd ucx && \
|
||||||
|
git checkout ${UCX_BRANCH} && \
|
||||||
|
./autogen.sh && \
|
||||||
|
mkdir build && cd build && \
|
||||||
|
../configure \
|
||||||
|
--prefix=/usr/local/ucx \
|
||||||
|
--enable-shared \
|
||||||
|
--disable-static \
|
||||||
|
--disable-doxygen-doc \
|
||||||
|
--enable-optimizations \
|
||||||
|
--enable-devel-headers \
|
||||||
|
--with-rocm=/opt/rocm \
|
||||||
|
--with-verbs \
|
||||||
|
--with-dm \
|
||||||
|
--enable-mt && \
|
||||||
|
make -j && \
|
||||||
|
make install
|
||||||
|
|
||||||
|
ENV PATH=/usr/local/ucx/bin:$PATH
|
||||||
|
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
|
||||||
|
|
||||||
|
RUN git clone ${RIXL_REPO} /opt/rixl && \
|
||||||
|
cd /opt/rixl && \
|
||||||
|
git checkout ${RIXL_BRANCH} && \
|
||||||
|
meson setup build --prefix=${RIXL_HOME} \
|
||||||
|
-Ducx_path=${UCX_HOME} \
|
||||||
|
-Drocm_path=${ROCM_PATH} && \
|
||||||
|
cd build && \
|
||||||
|
ninja && \
|
||||||
|
ninja install
|
||||||
|
|
||||||
|
# Generate RIXL wheel
|
||||||
|
RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||||
|
./contrib/build-wheel.sh \
|
||||||
|
--output-dir /app/install \
|
||||||
|
--rocm-dir ${ROCM_PATH} \
|
||||||
|
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
||||||
|
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------
|
||||||
|
# vLLM wheel release build stage (for building distributable wheels)
|
||||||
|
# This stage pins dependencies to custom ROCm wheel versions and handles version detection
|
||||||
|
FROM fetch_vllm AS build_vllm_wheel_release
|
||||||
|
|
||||||
|
ARG COMMON_WORKDIR
|
||||||
|
|
||||||
|
# Create /install directory for custom wheels
|
||||||
|
RUN mkdir -p /install
|
||||||
|
|
||||||
|
# Copy custom ROCm wheels from docker/context if they exist
|
||||||
|
# COPY ensures Docker cache is invalidated when wheels change
|
||||||
|
# .keep file ensures directory always exists for COPY to work
|
||||||
|
COPY docker/context/base-wheels/ /tmp/base-wheels/
|
||||||
|
# This is how we know if we are building for a wheel release or not.
|
||||||
|
# If there are not wheels found there, we are not building for a wheel release.
|
||||||
|
# So we exit with an error. To skip this stage.
|
||||||
|
RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
|
||||||
|
echo "Found custom wheels - copying to /install"; \
|
||||||
|
cp /tmp/base-wheels/*.whl /install/ && \
|
||||||
|
echo "Copied custom wheels:"; \
|
||||||
|
ls -lh /install/; \
|
||||||
|
else \
|
||||||
|
echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
|
||||||
|
echo "Wheel releases require pre-built ROCm wheels."; \
|
||||||
|
+        exit 1; \
+    fi
+
+# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
+# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
+ARG GIT_REPO_CHECK=0
+RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
+        echo "Running repository checks..."; \
+        cd vllm && bash tools/check_repo.sh; \
+    fi
+
+# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
+# This ensures setuptools_scm sees a clean repo state for version detection
+RUN --mount=type=bind,source=.git,target=vllm/.git \
+    cd vllm \
+    && pip install setuptools_scm \
+    && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
+    && echo "Detected vLLM version: ${VLLM_VERSION}" \
+    && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
+
+# Fail if git-based package dependencies are found in requirements files
+# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
+# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
+RUN echo "Checking for git-based packages in requirements files..." \
+    && echo "Checking common.txt for git-based packages:" \
+    && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
+        echo "ERROR: Git-based packages found in common.txt:"; \
+        grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
+        echo "Please publish these packages to PyPI instead of using git dependencies."; \
+        exit 1; \
+    else \
+        echo " ✓ No git-based packages found in common.txt"; \
+    fi \
+    && echo "Checking rocm.txt for git-based packages:" \
+    && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
+        echo "ERROR: Git-based packages found in rocm.txt:"; \
+        grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
+        echo "Please publish these packages to PyPI instead of using git dependencies."; \
+        exit 1; \
+    else \
+        echo " ✓ No git-based packages found in rocm.txt"; \
+    fi \
+    && echo "All requirements files are clean - no git-based packages found"
+
+# Pin vLLM dependencies to exact versions of custom ROCm wheels
+# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
+COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
+RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
+    && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
+
+# Install dependencies using custom wheels from /install
+RUN cd vllm \
+    && echo "Building vLLM with custom wheels from /install" \
+    && python3 -m pip install --find-links /install -r requirements/rocm.txt \
+    && python3 setup.py clean --all
+
+# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
+# (setup.py auto-detects sccache in PATH)
+RUN --mount=type=bind,source=.git,target=vllm/.git \
+    cd vllm \
+    && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
+    && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+
+FROM scratch AS export_vllm_wheel_release
+ARG COMMON_WORKDIR
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
 
 # -----------------------
 # Test vLLM image
 FROM base AS test
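The two RUN steps that capture and later replay the version are worth calling
out: setuptools_scm derives the version from git state, so it must run before
pin_rocm_dependencies.py dirties requirements/rocm.txt. A minimal sketch of the
same hand-off outside Docker (assuming setuptools_scm is installed; the exact
local-version suffix depends on its configuration):

    import os
    import setuptools_scm

    # Derive the version while the working tree is still clean...
    version = setuptools_scm.get_version()
    # ...then freeze it so a later build step (e.g. setup.py bdist_wheel)
    # does not re-derive a "dirty" version from the modified tree.
    os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = version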
@@ -159,3 +377,7 @@ ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
 RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
 
 CMD ["/bin/bash"]
+
+# Set entrypoint for vllm-openai official images
+FROM final AS vllm-openai
+ENTRYPOINT ["vllm", "serve"]
@@ -14,16 +14,13 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 ARG MORI_BRANCH="2d02c6a9"
 ARG MORI_REPO="https://github.com/ROCm/mori.git"
 
-#TODO: When patch has been upstreamed, switch to the main repo/branch
-# ARG RIXL_BRANCH="<TODO>"
-# ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
-ARG RIXL_BRANCH="50d63d94"
-ARG RIXL_REPO="https://github.com/vcave/RIXL.git"
-# Needed by RIXL
-ARG ETCD_BRANCH="7c6e714f"
-ARG ETCD_REPO="https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git"
-ARG UCX_BRANCH="da3fac2a"
-ARG UCX_REPO="https://github.com/ROCm/ucx.git"
+# Sccache configuration (only used in release pipeline)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 FROM ${BASE_IMAGE} AS base
@@ -64,6 +61,49 @@ RUN apt-get update -y \
 RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
 RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
+
+# Install sccache if USE_SCCACHE is enabled (for release builds)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME
+ARG SCCACHE_REGION_NAME
+ARG SCCACHE_S3_NO_CREDENTIALS
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && SCCACHE_ARCH="x86_64" \
+        && SCCACHE_VERSION="v0.8.1" \
+        && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
+        && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
+        && tar -xzf /tmp/sccache.tar.gz -C /tmp \
+        && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+        && chmod +x /usr/bin/sccache \
+        && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
+        && sccache --version; \
+    fi
+
+# Setup sccache for HIP compilation via HIP_CLANG_PATH
+# This creates wrapper scripts in a separate directory and points HIP to use them
+# This avoids modifying the original ROCm binaries which can break detection
+# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
+# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Setting up sccache wrappers for HIP compilation..." \
+        && mkdir -p /opt/sccache-wrappers \
+        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
+        && chmod +x /opt/sccache-wrappers/clang++ \
+        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
+        && chmod +x /opt/sccache-wrappers/clang \
+        && echo "sccache wrappers created in /opt/sccache-wrappers"; \
+    fi
+
+# Set sccache environment variables only when USE_SCCACHE=1
+# This prevents S3 config from leaking into images when sccache is not used
+ARG USE_SCCACHE
+ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
+ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
+ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
+ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
 
 
 ###
 ### Triton Build
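The ${VAR:+word} form used in the ENV lines above expands to word only when
VAR is set and non-empty, so the S3 configuration collapses to empty strings
whenever USE_SCCACHE is unset. A rough Python rendering of that shell
semantics (a sketch for illustration, not vLLM code):

    def colon_plus(var: str | None, word: str) -> str:
        """Shell ${var:+word}: return word if var is set and non-empty."""
        return word if var else ""

    assert colon_plus("1", "vllm-build-sccache") == "vllm-build-sccache"
    assert colon_plus("", "vllm-build-sccache") == ""
    assert colon_plus(None, "vllm-build-sccache") == ""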
@@ -100,22 +140,42 @@ ARG PYTORCH_AUDIO_BRANCH
 ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG PYTORCH_AUDIO_REPO
+ARG USE_SCCACHE
 
 RUN git clone ${PYTORCH_REPO} pytorch
 RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
 && pip install -r requirements.txt && git submodule update --init --recursive \
 && python3 tools/amd_build/build_amd.py \
+&& if [ "$USE_SCCACHE" = "1" ]; then \
+    export HIP_CLANG_PATH=/opt/sccache-wrappers \
+    && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+    && export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
+    && sccache --show-stats; \
+fi \
 && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
+&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
 && pip install dist/*.whl
 RUN git clone ${PYTORCH_VISION_REPO} vision
 RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
+&& if [ "$USE_SCCACHE" = "1" ]; then \
+    export HIP_CLANG_PATH=/opt/sccache-wrappers \
+    && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+    && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
+fi \
 && python3 setup.py bdist_wheel --dist-dir=dist \
+&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
 && pip install dist/*.whl
 RUN git clone ${PYTORCH_AUDIO_REPO} audio
 RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
 && git submodule update --init --recursive \
 && pip install -r requirements.txt \
+&& if [ "$USE_SCCACHE" = "1" ]; then \
+    export HIP_CLANG_PATH=/opt/sccache-wrappers \
+    && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+    && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
+fi \
 && python3 setup.py bdist_wheel --dist-dir=dist \
+&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
 && pip install dist/*.whl
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
 && cp /app/vision/dist/*.whl /app/install \
@@ -230,13 +290,19 @@ RUN cd /opt/rixl && mkdir -p /app/install && \
 FROM base AS build_fa
 ARG FA_BRANCH
 ARG FA_REPO
+ARG USE_SCCACHE
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
 pip install /install/*.whl
 RUN git clone ${FA_REPO}
 RUN cd flash-attention \
 && git checkout ${FA_BRANCH} \
 && git submodule update --init \
-&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+&& if [ "$USE_SCCACHE" = "1" ]; then \
+    export HIP_CLANG_PATH=/opt/sccache-wrappers \
+    && sccache --show-stats; \
+fi \
+&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
+&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
 RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
 
 
@@ -246,6 +312,7 @@ RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
 FROM base AS build_aiter
 ARG AITER_BRANCH
 ARG AITER_REPO
+ARG USE_SCCACHE
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
 pip install /install/*.whl
 RUN git clone --recursive ${AITER_REPO}
@@ -253,13 +320,37 @@ RUN cd aiter \
 && git checkout ${AITER_BRANCH} \
 && git submodule update --init --recursive \
 && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
+RUN pip install pyyaml && cd aiter \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+        export HIP_CLANG_PATH=/opt/sccache-wrappers \
+        && sccache --show-stats; \
+    fi \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
+    && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
 
 
 ###
 ### Final Build
 ###
 
+# Wheel release stage -
+# only includes dependencies used by wheel release pipeline
+FROM base AS debs_wheel_release
+RUN mkdir /app/debs
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+
+# Full debs stage - includes Mori (used by Docker releases)
 FROM base AS debs
 RUN mkdir /app/debs
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
 ```json
 {
     "pooling_type": "auto",
-    "normalize": true,
+    "use_activation": true,
     "enable_chunked_processing": true,
     "max_embed_len": 3072000
 }
@@ -14,7 +14,7 @@ Prerequisites:
 # MEAN pooling (processes all chunks, recommended for complete coverage)
 vllm serve intfloat/multilingual-e5-large \
     --pooler-config \
-    '{"pooling_type": "MEAN", "normalize": true, ' \
+    '{"pooling_type": "MEAN", "use_activation": true, ' \
     '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
     --served-model-name multilingual-e5-large \
     --trust-remote-code \
@@ -24,7 +24,7 @@ Prerequisites:
 # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
 vllm serve BAAI/bge-large-en-v1.5 \
     --pooler-config \
-    '{"pooling_type": "CLS", "normalize": true, ' \
+    '{"pooling_type": "CLS", "use_activation": true, ' \
     '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
     --served-model-name bge-large-en-v1.5 \
     --trust-remote-code \
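Once a server from one of the commands above is running, a quick client-side
check can confirm that long inputs embed successfully (a sketch assuming the
OpenAI-compatible endpoint on localhost:8000 and the openai package; the
served model name follows the MEAN-pooling example):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    resp = client.embeddings.create(
        model="multilingual-e5-large",
        # Far longer than max_model_len; chunked processing splits it server-side.
        input="a very long document " * 2000,
    )
    print(len(resp.data[0].embedding))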
@@ -96,7 +96,7 @@ echo ""
 echo "🔧 Starting server with enhanced chunked processing configuration..."
 
 # Build pooler config JSON
-POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
+POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
 
 # Start vLLM server with enhanced chunked processing
 vllm serve "$MODEL_NAME" \
@@ -80,6 +80,8 @@ num2words==0.5.14
 pqdm==0.2.0
 # via lm-eval
 
+# Required for fastsafetensors test
+fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
 # Required for suffix decoding test
 arctic-inference == 0.1.1
 # Required for Nemotron test
@@ -15,5 +15,4 @@ setuptools-scm>=8
 runai-model-streamer[s3,gcs]==0.15.3
 conch-triton-kernels==1.2.1
 timm>=1.0.17
-fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
 grpcio-tools>=1.76.0
@@ -53,7 +53,9 @@ def test_token_embed(llm: LLM):
 def test_pooling_params(llm: LLM):
     def get_outputs(normalize):
         outputs = llm.embed(
-            prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
+            prompts,
+            pooling_params=PoolingParams(use_activation=normalize),
+            use_tqdm=False,
         )
         return torch.tensor([x.outputs.embedding for x in outputs])
 
@@ -216,7 +216,7 @@ def server_with_chunked_processing():
         "512",  # Set smaller max_model_len to trigger chunking mechanism
         "--pooler-config",
         (
-            '{"pooling_type": "MEAN", "normalize": true, '
+            '{"pooling_type": "MEAN", "use_activation": true, '
             '"enable_chunked_processing": true, "max_embed_len": 10000}'
         ),
         "--gpu-memory-utilization",
@@ -236,17 +236,14 @@ class TestModel:
                 "use_activation": use_activation,
             },
         )
-        if response.status_code != 200:
-            return response
-
         outputs = response.json()
         return torch.tensor([x["score"] for x in outputs["data"]])
 
-    if model["is_cross_encoder"]:
-        default = get_outputs(use_activation=None)
-        w_activation = get_outputs(use_activation=True)
-        wo_activation = get_outputs(use_activation=False)
-
+    default = get_outputs(use_activation=None)
+    w_activation = get_outputs(use_activation=True)
+    wo_activation = get_outputs(use_activation=False)
+
+    if model["is_cross_encoder"]:
         assert torch.allclose(default, w_activation, atol=1e-2), (
             "Default should use activation."
         )
@@ -256,9 +253,3 @@ class TestModel:
         assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
             "w_activation should be close to activation(wo_activation)."
         )
-    else:
-        get_outputs(use_activation=None)
-
-        # The activation parameter only works for the is_cross_encoder model
-        response = get_outputs(use_activation=True)
-        assert response.status_code == 400
@@ -48,7 +48,7 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
     # asserts on the pooling config files
     assert model_config.pooler_config.seq_pooling_type == "CLS"
     assert model_config.pooler_config.tok_pooling_type == "ALL"
-    assert model_config.pooler_config.normalize
+    assert model_config.pooler_config.use_activation
 
     # asserts on the tokenizer loaded
     assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
@@ -93,7 +93,7 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     # asserts on the pooling config files
     assert model_config.pooler_config.seq_pooling_type == "MEAN"
     assert model_config.pooler_config.tok_pooling_type == "ALL"
-    assert model_config.pooler_config.normalize
+    assert model_config.pooler_config.use_activation
 
     # asserts on the tokenizer loaded
     assert model_config.tokenizer == "intfloat/multilingual-e5-base"
@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
     ) as vllm_model:
         wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
 
@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
     ) as vllm_model:
         w_normalize = torch.tensor(vllm_model.embed(example_prompts))
 
@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=False),
+        pooler_config=PoolerConfig(use_activation=False),
     ) as vllm_model:
         wo_normalize = vllm_model.token_embed(example_prompts)
 
@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
         model,
         max_model_len=512,
         dtype=dtype,
-        pooler_config=PoolerConfig(normalize=True),
+        pooler_config=PoolerConfig(use_activation=True),
     ) as vllm_model:
         w_normalize = vllm_model.token_embed(example_prompts)
 
@@ -160,7 +160,7 @@ def test_get_pooling_config():
     model_config = ModelConfig(model_id)
 
     assert model_config.pooler_config is not None
-    assert model_config.pooler_config.normalize
+    assert model_config.pooler_config.use_activation
     assert model_config.pooler_config.seq_pooling_type == "MEAN"
     assert model_config.pooler_config.tok_pooling_type == "ALL"
 
@@ -18,7 +18,7 @@ EMBEDDING_MODELS = [
 ]
 
 classify_parameters = ["use_activation"]
-embed_parameters = ["dimensions", "normalize"]
+embed_parameters = ["dimensions", "use_activation"]
 step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
 
 
@@ -42,17 +42,17 @@ def test_embed():
     task = "embed"
     model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
 
-    pooling_params = PoolingParams(normalize=None)
+    pooling_params = PoolingParams(use_activation=None)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=True)
+    pooling_params = PoolingParams(use_activation=True)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=False)
+    pooling_params = PoolingParams(use_activation=False)
     pooling_params.verify(task=task, model_config=model_config)
 
     invalid_parameters = classify_parameters + step_pooling_parameters
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(embed_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
@@ -98,7 +98,7 @@ def test_classify(task):
     pooling_params.verify(task=task, model_config=model_config)
 
     invalid_parameters = embed_parameters + step_pooling_parameters
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(classify_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
@@ -111,20 +111,20 @@ def test_token_embed(pooling_type: str):
         pooler_config=PoolerConfig(tok_pooling_type=pooling_type)
     )
 
-    pooling_params = PoolingParams(normalize=None)
+    pooling_params = PoolingParams(use_activation=None)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=True)
+    pooling_params = PoolingParams(use_activation=True)
     pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(normalize=False)
+    pooling_params = PoolingParams(use_activation=False)
     pooling_params.verify(task=task, model_config=model_config)
 
     invalid_parameters = classify_parameters
     if pooling_type != "STEP":
         invalid_parameters = classify_parameters + step_pooling_parameters
 
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(embed_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
@@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str):
     if pooling_type != "STEP":
         invalid_parameters = embed_parameters + step_pooling_parameters
 
-    for p in invalid_parameters:
+    for p in set(invalid_parameters) - set(classify_parameters):
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
             pooling_params.verify(task=task, model_config=model_config)
@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
         ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
         # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
        ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
-        (None, None, 1, 1, None, None),
+        # When kv_offloading_size is None, offloading is disabled (backend is ignored)
+        ("native", None, 1, 1, None, None),
     ],
 )
 def test_kv_connector(
@@ -62,3 +63,19 @@ def test_kv_connector(
     assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
     # Existing config should be replaced
     assert "existing_key" not in kv_connector_extra_config
+
+
+def test_kv_offloading_size_only_uses_native_default():
+    """Test that setting only kv_offloading_size enables native offloading."""
+    vllm_config = VllmConfig(
+        cache_config=CacheConfig(
+            kv_offloading_size=4.0,
+            # kv_offloading_backend not set, should default to "native"
+        ),
+    )
+
+    kv_transfer_config = vllm_config.kv_transfer_config
+    kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config
+    assert kv_transfer_config.kv_connector == "OffloadingConnector"
+    assert kv_transfer_config.kv_role == "kv_both"
+    assert kv_connector_extra_config["cpu_bytes_to_use"] == 4.0 * (1 << 30)
tools/vllm-rocm/pin_rocm_dependencies.py (new file, 221 lines)
@@ -0,0 +1,221 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pin vLLM dependencies to exact versions of custom ROCm wheels.

This script modifies vLLM's requirements files to replace version constraints
with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).

This ensures that 'pip install vllm' automatically installs the correct custom wheels
instead of allowing pip to download different versions from PyPI.
"""

import re
import sys
from pathlib import Path


def extract_version_from_wheel(wheel_name: str) -> str:
    """
    Extract version from wheel filename.

    Example:
        torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
        triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0
    """
    # Wheel format:
    # {distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl
    parts = wheel_name.replace(".whl", "").split("-")

    if len(parts) < 5:
        raise ValueError(f"Invalid wheel filename format: {wheel_name}")

    # Version is the second part
    version = parts[1]
    return version


def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
    """
    Read /install directory and extract versions of custom wheels.

    Returns:
        Dict mapping package names to exact versions
    """
    install_path = Path(install_dir)
    if not install_path.exists():
        print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
        sys.exit(1)

    versions = {}

    # Map wheel prefixes to package names
    # IMPORTANT: Use dashes to avoid matching substrings
    # (e.g., 'torch' would match 'torchvision')
    # ORDER MATTERS: This order is preserved when pinning dependencies
    # in requirements files
    package_mapping = [
        ("torch-", "torch"),  # Match torch- (not torchvision)
        ("triton-", "triton"),  # Match triton- (not triton_kernels)
        ("triton_kernels-", "triton-kernels"),  # Match triton_kernels-
        ("torchvision-", "torchvision"),  # Match torchvision-
        ("torchaudio-", "torchaudio"),  # Match torchaudio-
        ("amdsmi-", "amdsmi"),  # Match amdsmi-
        ("flash_attn-", "flash-attn"),  # Match flash_attn-
        ("aiter-", "aiter"),  # Match aiter-
    ]

    for wheel_file in install_path.glob("*.whl"):
        wheel_name = wheel_file.name

        for prefix, package_name in package_mapping:
            if wheel_name.startswith(prefix):
                try:
                    version = extract_version_from_wheel(wheel_name)
                    versions[package_name] = version
                    print(f"Found {package_name}=={version}", file=sys.stderr)
                except Exception as e:
                    print(
                        f"WARNING: Could not extract version from {wheel_name}: {e}",
                        file=sys.stderr,
                    )
                break

    # Return versions in the order defined by package_mapping
    ordered_versions = {}
    for _, package_name in package_mapping:
        if package_name in versions:
            ordered_versions[package_name] = versions[package_name]
    return ordered_versions


def pin_dependencies_in_requirements(requirements_path: str, versions: dict[str, str]):
    """
    Insert custom wheel pins at the TOP of requirements file.

    This ensures that when setup.py processes the file line-by-line,
    custom wheels (torch, triton, etc.) are encountered FIRST, before
    any `-r common.txt` includes that might pull in other dependencies.

    Creates:
        # Custom ROCm wheel pins (auto-generated)
        torch==2.9.0a0+git1c57644
        triton==3.4.0
        torchvision==0.23.0a0+824e8c8
        amdsmi==26.1.0+5df6c765

        -r common.txt
        ... rest of file ...
    """
    requirements_file = Path(requirements_path)

    if not requirements_file.exists():
        print(
            f"ERROR: Requirements file not found: {requirements_path}", file=sys.stderr
        )
        sys.exit(1)

    # Backup original file
    backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
    with open(requirements_file) as f:
        original_lines = f.readlines()

    # Write backup
    with open(backup_file, "w") as f:
        f.writelines(original_lines)

    # Build header with pinned custom wheels
    header_lines = [
        "# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
        "# These must come FIRST to ensure correct dependency resolution\n",
    ]

    for package_name, exact_version in versions.items():
        header_lines.append(f"{package_name}=={exact_version}\n")

    header_lines.append("\n")  # Blank line separator

    # Filter out any existing entries for custom packages from original file
    filtered_lines = []
    removed_packages = []

    for line in original_lines:
        stripped = line.strip()
        should_keep = True

        # Check if this line is for one of our custom packages
        if stripped and not stripped.startswith("#") and not stripped.startswith("-"):
            for package_name in versions:
                # Handle both hyphen and underscore variations
                pattern_name = package_name.replace("-", "[-_]")
                pattern = rf"^{pattern_name}\s*[=<>]=?\s*[\d.a-zA-Z+]+"

                if re.match(pattern, stripped, re.IGNORECASE):
                    removed_packages.append(f"{package_name}: {stripped}")
                    should_keep = False
                    break

        if should_keep:
            filtered_lines.append(line)

    # Combine: header + filtered original content
    final_lines = header_lines + filtered_lines

    # Write modified content
    with open(requirements_file, "w") as f:
        f.writelines(final_lines)

    # Print summary
    print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
    for package_name, exact_version in versions.items():
        print(f" - {package_name}=={exact_version}", file=sys.stderr)

    if removed_packages:
        print("\n✓ Removed old package entries:", file=sys.stderr)
        for pkg in removed_packages:
            print(f" - {pkg}", file=sys.stderr)

    print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
    print(f" Backup saved: {backup_file}", file=sys.stderr)


def main():
    if len(sys.argv) != 3:
        print(
            f"Usage: {sys.argv[0]} <install_dir> <requirements_file>", file=sys.stderr
        )
        print(
            f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
            file=sys.stderr,
        )
        sys.exit(1)

    install_dir = sys.argv[1]
    requirements_path = sys.argv[2]

    print("=" * 70, file=sys.stderr)
    print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
    print("=" * 70, file=sys.stderr)

    # Get versions from custom wheels
    print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
    versions = get_custom_wheel_versions(install_dir)

    if not versions:
        print("\nERROR: No custom wheels found in /install!", file=sys.stderr)
        sys.exit(1)

    # Pin dependencies in requirements file
    print(f"\nPatching {requirements_path}...", file=sys.stderr)
    pin_dependencies_in_requirements(requirements_path, versions)

    print("\n" + "=" * 70, file=sys.stderr)
    print("✓ Dependency pinning complete!", file=sys.stderr)
    print("=" * 70, file=sys.stderr)

    sys.exit(0)


if __name__ == "__main__":
    main()
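A sketch of driving the script programmatically rather than via the CLI
(assumes the module is importable from the working directory; the exact pins
depend on which wheels sit in /install):

    from pin_rocm_dependencies import (
        get_custom_wheel_versions,
        pin_dependencies_in_requirements,
    )

    versions = get_custom_wheel_versions("/install")
    # e.g. {"torch": "2.9.0a0+git1c57644", "triton": "3.4.0", ...}
    pin_dependencies_in_requirements("requirements/rocm.txt", versions)
    # rocm.txt now begins with torch==..., triton==..., and the original
    # content is preserved in requirements/rocm.txt.bak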
@@ -152,13 +152,13 @@ class CacheConfig:
     kv_offloading_size: float | None = None
     """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
     the total buffer size summed across all TP ranks. By default, this is set
-    to None, which means no KV offloading is enabled. When set with
-    kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
+    to None, which means no KV offloading is enabled. When set, vLLM will
+    enable KV cache offloading to CPU using the kv_offloading_backend."""
 
-    kv_offloading_backend: KVOffloadingBackend | None = None
+    kv_offloading_backend: KVOffloadingBackend = "native"
     """The backend to use for KV cache offloading. Supported backends include
-    'native' (vLLM native CPU offloading), 'lmcache' This option must be used
-    together with kv_offloading_size."""
+    'native' (vLLM native CPU offloading), 'lmcache'.
+    KV offloading is only activated when kv_offloading_size is set."""
 
     def compute_hash(self) -> str:
         """
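With the new default, setting the buffer size alone is enough to turn
offloading on. A hedged sketch of the resulting usage (the LLM entrypoint is
assumed to forward kv_offloading_size through EngineArgs to CacheConfig; the
model name is illustrative):

    from vllm import LLM

    # 4 GiB CPU offloading buffer; no explicit kv_offloading_backend needed,
    # since the backend now defaults to "native".
    llm = LLM(model="facebook/opt-125m", kv_offloading_size=4.0)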
@@ -48,7 +48,7 @@ class PoolerConfig:
     ## for embeddings models
     normalize: bool | None = None
     """
-    Whether to normalize the embeddings outputs. Defaults to True.
+    DEPRECATED: please use `use_activation` instead.
     """
     dimensions: int | None = None
     """
@@ -75,11 +75,11 @@ class PoolerConfig:
     ## for classification models
     softmax: float | None = None
     """
-    softmax will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
     """
     activation: float | None = None
     """
-    activation will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
     """
     use_activation: bool | None = None
     """
@@ -164,17 +164,24 @@ class PoolerConfig:
 
 
 def get_use_activation(o: object):
-    if softmax := getattr(o, "softmax", None) is not None:
-        logger.warning_once(
-            "softmax will be deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+    if (normalize := getattr(o, "normalize", None)) is not None:
+        logger.warning_once(
+            "`normalize` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
+        )
+        return normalize
+
+    if (softmax := getattr(o, "softmax", None)) is not None:
+        logger.warning_once(
+            "`softmax` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
         )
         return softmax
 
-    if activation := getattr(o, "activation", None) is not None:
+    if (activation := getattr(o, "activation", None)) is not None:
         logger.warning_once(
-            "activation will be deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+            "`activation` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
         )
         return activation
 
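The added parentheses around the walrus assignments are a real bug fix, not
style: := binds more loosely than `is not None`, so the old form bound the
comparison result rather than the attribute value. A minimal illustration:

    softmax = 0.0
    if unfixed := softmax is not None:
        print(unfixed)  # True (the boolean comparison, so a bool leaked out)
    if (fixed := softmax) is not None:
        print(fixed)    # 0.0 (the actual value, as intended)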
@@ -498,17 +498,15 @@ class VllmConfig:
         Right now, this function reads the offloading settings from
         CacheConfig and configures the KVTransferConfig accordingly.
         """
-        if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None:
+        # KV offloading is only activated when kv_offloading_size is set.
+        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
             return
 
+        kv_offloading_backend = self.cache_config.kv_offloading_backend
+
         # If no KVTransferConfig is provided, create a default one.
         if self.kv_transfer_config is None:
             self.kv_transfer_config = KVTransferConfig()
 
-        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
-            raise ValueError(
-                "You must set kv_offloading_size when kv_offloading_backend is set."
-            )
         num_kv_ranks = (
             self.parallel_config.tensor_parallel_size
             * self.parallel_config.pipeline_parallel_size
@@ -234,7 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
             lora_id=e.lora_id,
             block_size=e.block_size,
             medium=e.medium,
-            lora_name=e.lora_name,
+            lora_name=getattr(e, "lora_name", None),
         )
         for e in events
     ]
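The getattr form is a defensive-compatibility pattern: events that lack the
lora_name field no longer raise AttributeError. The same idea in isolation
(the event class here is hypothetical):

    from dataclasses import dataclass

    @dataclass
    class OldEvent:  # no lora_name attribute
        block_size: int

    e = OldEvent(block_size=16)
    lora_name = getattr(e, "lora_name", None)  # None instead of AttributeError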
@@ -578,9 +578,7 @@ class EngineArgs:
     optimization_level: OptimizationLevel = VllmConfig.optimization_level
 
     kv_offloading_size: float | None = CacheConfig.kv_offloading_size
-    kv_offloading_backend: KVOffloadingBackend | None = (
-        CacheConfig.kv_offloading_backend
-    )
+    kv_offloading_backend: KVOffloadingBackend = CacheConfig.kv_offloading_backend
     tokens_only: bool = False
 
     def __post_init__(self):
@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
         )
 
 
@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
         )
 
 
@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
             use_activation=get_use_activation(self),
         )
 
@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
         return PoolingParams(
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             dimensions=self.dimensions,
-            normalize=self.normalize,
             use_activation=get_use_activation(self),
         )
 
@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead):
 
         # for normalize
         if self.activation is not None:
-            flags = [p.normalize for p in pooling_params]
+            flags = [p.use_activation for p in pooling_params]
             if len(set(flags)) == 1:
                 if flags[0]:
                     pooled_data = self.activation(pooled_data)
@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig):
     vllm_config = get_current_vllm_config()
     model_config = vllm_config.model_config
     head = EmbeddingPoolerHead(
-        projector=_load_st_projector(model_config),
         head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
         activation=PoolerNormalize(),
     )
 
@@ -116,9 +116,9 @@ def pooler_for_classify(
     vllm_config = get_current_vllm_config()
     model_config = vllm_config.model_config
     head = ClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
         classifier=classifier,
         logit_bias=model_config.pooler_config.logit_bias,
-        head_dtype=model_config.head_dtype,
         activation=resolve_classifier_act_fn(
             model_config, static_num_labels=True, act_fn=act_fn
         ),
@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC):
 class TokenEmbeddingPoolerHead(TokenPoolerHead):
     def __init__(
         self,
-        projector: ProjectorFn | None = None,
         head_dtype: torch.dtype | str | None = None,
+        projector: ProjectorFn | None = None,
         activation: ActivationFn | None = None,
     ) -> None:
         super().__init__()
 
-        self.projector = projector
         self.head_dtype = head_dtype
+        self.projector = projector
         self.activation = activation
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
             pooled_data = pooled_data[..., : pooling_param.dimensions]
 
         # for normalize
-        if self.activation is not None and pooling_param.normalize:
+        if self.activation is not None and pooling_param.use_activation:
             pooled_data = self.activation(pooled_data)
 
         # pooled_data shape: [n_tokens, embedding_dimension]
@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig):
     vllm_config = get_current_vllm_config()
     model_config = vllm_config.model_config
     head = TokenEmbeddingPoolerHead(
-        projector=_load_st_projector(model_config),
         head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
         activation=PoolerNormalize(),
     )
 
@@ -116,9 +116,9 @@ def pooler_for_token_classify(
     vllm_config = get_current_vllm_config()
     model_config = vllm_config.model_config
     head = TokenClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
         classifier=classifier,
         logit_bias=model_config.pooler_config.logit_bias,
-        head_dtype=model_config.head_dtype,
         activation=resolve_classifier_act_fn(
             model_config, static_num_labels=False, act_fn=act_fn
         ),
@@ -98,7 +98,9 @@ class QuantFP8(CustomOp):
             num_token_padding=self.num_token_padding,
             scale_ub=scale_ub,
             use_per_token_if_dynamic=self.use_per_token_if_dynamic,
-            group_shape=self.group_shape if self.static else None,
+            group_shape=(self.group_shape.row, self.group_shape.col)
+            if self.static
+            else None,
         )
 
     def forward_hip(
@@ -116,8 +116,8 @@ class BertPooler(SequencePooler):
 
         # Use lambdas so that weights are not registered under `self.head`
         self.head = EmbeddingPoolerHead(
-            projector=lambda x: self.dense(x),
             head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
             activation=LambdaPoolerActivation(self.act_fn),
         )
 
@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler):
             config.hidden_size,
             eps=config.norm_eps,
             bias=config.norm_bias,
+            dtype=head_dtype,
         )
 
         # Use lambdas so that weights are not registered under `self.head`
         self.head = EmbeddingPoolerHead(
-            projector=lambda x: self.dense(x),
             head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
             activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))),
         )
 
@@ -26,9 +26,9 @@ class PoolingParams(
|
|||||||
Set to None to disable truncation.
|
Set to None to disable truncation.
|
||||||
dimensions: Reduce the dimensions of embeddings
|
dimensions: Reduce the dimensions of embeddings
|
||||||
if model support matryoshka representation.
|
if model support matryoshka representation.
|
||||||
normalize: Whether to normalize the embeddings outputs.
|
normalize: Deprecated, please use use_activation instead.
|
||||||
softmax: softmax will be deprecated, please use use_activation instead.
|
softmax: Deprecated, please use use_activation instead.
|
||||||
activation: activation will be deprecated, please use use_activation instead.
|
activation: Deprecated, please use use_activation instead.
|
||||||
use_activation: Whether to apply activation function to
|
use_activation: Whether to apply activation function to
|
||||||
the classification outputs.
|
the classification outputs.
|
||||||
"""
|
"""
|
||||||
@@ -63,15 +63,15 @@ class PoolingParams(

     @property
     def all_parameters(self) -> list[str]:
-        return ["dimensions", "normalize", "use_activation"]
+        return ["dimensions", "use_activation"]

     @property
     def valid_parameters(self):
         return {
-            "embed": ["dimensions", "normalize"],
+            "embed": ["dimensions", "use_activation"],
             "classify": ["use_activation"],
             "score": ["use_activation"],
-            "token_embed": ["dimensions", "normalize"],
+            "token_embed": ["dimensions", "use_activation"],
             "token_classify": ["use_activation"],
         }

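Together with the docstring change above, these hunks retire normalize in favor of use_activation across the parameter lists. A hedged sketch of how such a deprecated argument can be folded into its replacement; the names mirror the diff, but this is not vLLM's exact constructor:

# Sketch of the deprecation pattern: accept the old `normalize` argument,
# warn, and coalesce it into `use_activation`.
import warnings

class Params:
    def __init__(self, normalize: bool | None = None,
                 use_activation: bool | None = None):
        if normalize is not None:
            warnings.warn(
                "`normalize` is deprecated, please use `use_activation` instead.",
                DeprecationWarning,
            )
            if use_activation is None:
                use_activation = normalize
        self.use_activation = use_activation

print(Params(normalize=True).use_activation)  # True, with a DeprecationWarning
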
@@ -162,8 +162,8 @@ class PoolingParams(

     def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
         if self.task in ["embed", "token_embed"]:
-            if self.normalize is None:
-                self.normalize = True
+            if self.use_activation is None:
+                self.use_activation = True

             if self.dimensions is not None and model_config is not None:
                 if not model_config.is_matryoshka:

@@ -213,7 +213,6 @@ class PoolingParams(
         return (
             f"PoolingParams("
             f"task={self.task}, "
-            f"normalize={self.normalize}, "
             f"dimensions={self.dimensions}, "
             f"use_activation={self.use_activation}, "
             f"step_tag_id={self.step_tag_id}, "

@@ -801,7 +801,7 @@ def get_pooling_config(

     logger.info("Found pooling configuration.")

-    config: dict[str, Any] = {"normalize": normalize}
+    config: dict[str, Any] = {"use_activation": normalize}
     for key, val in pooling_dict.items():
         if val is True:
             pooling_type = parse_pooling_type(key)

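The get_pooling_config change stores the boolean read from a Sentence-Transformers-style pooling config (historically named normalize) under the new use_activation key. A small sketch of that translation, with a hypothetical stand-in for parse_pooling_type:

# Sketch of the config translation: the legacy `normalize` flag from a
# pooling config now lands under the `use_activation` key. Helper names
# here are illustrative, not vLLM's.
from typing import Any

def build_pooling_config(pooling_dict: dict[str, Any], normalize: bool) -> dict[str, Any]:
    config: dict[str, Any] = {"use_activation": normalize}
    for key, val in pooling_dict.items():
        if val is True:
            config["pooling_type"] = key  # stand-in for parse_pooling_type(key)
    return config

print(build_pooling_config({"pooling_mode_mean_tokens": True}, normalize=True))
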
@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
         # ROCM paged attention kernel only supports block sizes 16 and 32
         # due to shared memory (LDS) constraints on AMD GPUs.
         # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
-        return [16, 32]
+        # However, the limitation to [16, 32] is reasonable for the native C++
+        # kernel, but vLLM should allow non-standard sizes via the Triton path,
+        # as addressed in https://github.com/vllm-project/vllm/pull/31380:
+        # the Triton kernel under rocm_atten did not support inference for a
+        # non-standard qwen3-next model with a block_size of 544. We fixed the
+        # Triton kernel so that the standard model keeps the original
+        # bit-addressing logic, while the non-standard model uses the
+        # optimized kernel logic.
+        return [16, 32, 544]

     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:

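What the extended return value enables, in sketch form: a backend-side check that now accepts the non-standard 544 block size alongside 16 and 32. The validation helper below is illustrative, not vLLM's actual hook:

# Hedged sketch: a block-size check against the backend's supported list.
SUPPORTED_BLOCK_SIZES = [16, 32, 544]

def validate_block_size(block_size: int) -> None:
    if block_size not in SUPPORTED_BLOCK_SIZES:
        raise ValueError(
            f"block_size={block_size} is not supported by the ROCm attention "
            f"backend; expected one of {SUPPORTED_BLOCK_SIZES}."
        )

validate_block_size(544)  # passes after this change; raised before
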
@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module):
         k: torch.Tensor | None,
         p: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # FIXME: Fix aiter_sampler's accuracy issue and remove this flag
+        DISABLE_AITER_SAMPLER = True
         """Optimized ROCm/aiter path (same structure as forward_cuda)."""
         if (k is None and p is None) or generators:
             if generators:

@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module):
             "processed_logits",
             "processed_logprobs",
         ), "aiter sampler does not support returning logits/logprobs."
+        if DISABLE_AITER_SAMPLER:
+            return self.forward_native(logits, generators, k, p)
         return self.aiter_sample(logits, k, p, generators), None

     def aiter_sample(

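Both sampler hunks implement a kill-switch pattern: the optimized aiter path is parked behind a constant, and execution falls back to the reference implementation until the accuracy issue is fixed. A toy version of the same control flow (stand-in functions, not vLLM's sampler):

# Kill-switch sketch: guard an optimized path behind a flag and fall back
# to the reference implementation.
DISABLE_AITER_SAMPLER = True  # FIXME-style flag until accuracy is fixed

def forward_native(logits):
    return max(range(len(logits)), key=logits.__getitem__)  # reference argmax

def aiter_sample(logits):
    raise RuntimeError("optimized path with a known accuracy issue")

def sample(logits):
    if DISABLE_AITER_SAMPLER:
        return forward_native(logits)  # safe fallback
    return aiter_sample(logits)

print(sample([0.1, 2.0, 0.3]))  # 1
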