Compare commits

...

16 Commits

Author SHA1 Message Date
Shengqi Chen
d7de043d55 [CI] fix version comparsion and exclusion patterns in upload-release-wheels.sh (#32971)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
(cherry picked from commit 136c499f6e)
2026-01-23 14:22:49 -08:00
Nicolò Lucchesi
4dc11b06d3 [Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)
Signed-off-by: NickLucche <nlucches@redhat.com>
(cherry picked from commit ea6102b85d)
2026-01-23 02:53:12 -08:00
Isotr0py
2bd95d803a [Misc] Bump opencv-python dependecy version to 4.13 (#32668)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit 444e2e7e1f)
2026-01-23 02:52:47 -08:00
Isotr0py
f46d576c54 [Misc] Replace urllib's urlparse with urllib3's parse_url (#32746)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit 8ebf271bb6)
2026-01-23 02:51:53 -08:00
Shengqi Chen
d68209402d [build] fix cu130 related release pipeline steps and publish as nightly image (#32522)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
(cherry picked from commit 965765aef9)
2026-01-17 18:38:46 -08:00
Shengqi Chen
b17039bccc [CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032)
(cherry picked from commit 8e61425ee6)
2026-01-16 21:04:48 -08:00
Cyrus Leung
48b67ba75f [Frontend] Standardize use of create_error_response (#32319)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-16 11:35:10 +00:00
TJian
09f4264a55 [Bugfix] Fix ROCm dockerfiles (#32447)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-16 10:50:00 +08:00
Matthew Bonanni
7f42dc20bb [CI] Fix LM Eval Large Models (H100) (#32423)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit bcf2333cd6)
2026-01-15 18:00:21 -08:00
TJian
c2a37a3cf8 Cherry pick [ROCm] [CI] [Release] Rocm wheel pipeline with sccache #32264
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-01-15 17:59:58 -08:00
Michael Goin
0e31fc7996 [UX] Use kv_offloading_backend=native by default (#32421)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit 1be5a73571)
2026-01-15 17:55:20 -08:00
Pleaplusone
6ac0fcf416 [ROCm][Bugfix] Disable hip sampler to fix deepseek's accuracy issue on ROCm (#32413)
Signed-off-by: ganyi <ygan@amd.com>
(cherry picked from commit 77c16df31d)
2026-01-15 17:55:06 -08:00
Douglas Lehr
b62249725c [ROCM] Add ROCm image build to release pipeline (#31995)
Signed-off-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
(cherry picked from commit c5891b5430)
2026-01-15 17:54:47 -08:00
vllmellm
1b57275207 [Bugfix][ROCm][performance] Resolve the performance regression issue of the Qwen3-Next-80B-A3B-Thinking under rocm_atten (#32336)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
(cherry picked from commit e27078ea80)
2026-01-15 17:54:01 -08:00
Martin Hickey
2c24bc6996 [BugFix] [KVConnector] Fix KV events for LMCache connector (#32169)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-13 10:56:23 -08:00
Cyrus Leung
0aa8c40552 [Bugfix] Replace PoolingParams.normalize with use_activation (#32243)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-13 10:56:23 -08:00
65 changed files with 1908 additions and 337 deletions

View File

@@ -1,6 +1,6 @@
steps:
# aarch64 + CUDA builds
- label: "Build arm64 wheel - CUDA 12.9"
- label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
@@ -11,11 +11,11 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build arm64 wheel - CUDA 13.0"
- label: "Build wheel - aarch64 - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
@@ -26,12 +26,12 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build arm64 CPU wheel"
- label: "Build wheel - aarch64 - CPU"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
@@ -40,39 +40,39 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
- label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 13.0"
- label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~
id: build-wheel-cuda-13-0
id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 CPU wheel build
- label: "Build x86 CPU wheel"
- label: "Build wheel - x86_64 - CPU"
depends_on: ~
id: build-wheel-x86-cpu
agents:
@@ -81,12 +81,12 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# Build release images (12.9)
- label: "Build release image (x86)"
# Build release images (CUDA 12.9)
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
@@ -99,7 +99,7 @@ steps:
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image (arm64)"
- label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
id: build-release-image-arm64
agents:
@@ -109,34 +109,93 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
- label: "Create multi-arch manifest"
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: cpu_queue_postmerge
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- block: "Build CUDA 13.0 release images"
key: block-release-image-build-cuda-13-0
depends_on: ~
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Build release image - aarch64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- input: "Provide Release version here"
id: input-release-version
fields:
- text: "What is the release version?"
key: release-version
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
depends_on:
- input-release-version
- build-wheel-x86-cuda-12-9
- build-wheel-x86-cuda-13-0
- build-wheel-x86-cpu
- build-wheel-arm64-cuda-12-9
- build-wheel-arm64-cuda-13-0
- build-wheel-arm64-cpu
- label: "Upload release wheels to PyPI and GitHub"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels.sh"
- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
@@ -169,24 +228,31 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- block: "Build ROCm release image"
key: block-rocm-release-image-build
depends_on: ~
- label: "Build release image (ROCm)"
depends_on: block-rocm-release-image-build
id: build-release-image-rocm
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# Build base image first
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
# Build vLLM ROCm image using the base
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: cpu_queue_postmerge
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- "docker push vllm/vllm-openai:nightly-x86_64"
- "docker push vllm/vllm-openai:nightly-aarch64"
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest push vllm/vllm-openai:nightly"
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "bash .buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
@@ -196,3 +262,384 @@ steps:
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Build and publish nightly multi-arch image to DockerHub - CUDA 13.0"
depends_on:
- create-multi-arch-manifest-cuda-13-0
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
# =============================================================================
#
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
# To build a specific version, trigger the build from that branch/tag.
#
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
#
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
#
# =============================================================================
# ROCm Input Step - Collect build configuration (manual trigger only)
- input: "ROCm Wheel Release Build Configuration"
key: input-rocm-config
depends_on: ~
if: build.source == "ui"
fields:
- text: "Python Version"
key: "rocm-python-version"
default: "3.12"
hint: "Python version (e.g., 3.12)"
- text: "GPU Architectures"
key: "rocm-pytorch-rocm-arch"
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
hint: "Semicolon-separated GPU architectures"
- select: "Upload Wheels to S3"
key: "rocm-upload-wheels"
default: "true"
options:
- label: "No - Build only (nightly/dev)"
value: "false"
- label: "Yes - Upload to S3 (release)"
value: "true"
- select: "Force Rebuild Base Wheels"
key: "rocm-force-rebuild"
default: "false"
hint: "Ignore S3 cache and rebuild base wheels from scratch"
options:
- label: "No - Use cached wheels if available"
value: "false"
- label: "Yes - Rebuild even if cache exists"
value: "true"
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
- label: ":rocm: Build ROCm Base Wheels"
id: build-rocm-base-wheels
depends_on:
- step: input-rocm-config
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
agents:
queue: cpu_queue_postmerge
commands:
# Set configuration and check cache
- |
set -euo pipefail
# Get values from meta-data (set by input step) or use defaults
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
# Check for force rebuild flag
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
fi
echo "========================================"
echo "ROCm Base Wheels Build Configuration"
echo "========================================"
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
echo "========================================"
# Save resolved config for later jobs
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
# Check S3 cache for pre-built wheels
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
echo ""
echo "Cache key: $${CACHE_KEY}"
echo "Cache path: $${CACHE_PATH}"
# Save cache key for downstream jobs
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
CACHE_STATUS="miss"
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
else
echo "Force rebuild requested, skipping cache check"
fi
if [ "$${CACHE_STATUS}" = "hit" ]; then
echo ""
echo "CACHE HIT! Downloading pre-built wheels..."
echo ""
.buildkite/scripts/cache-rocm-base-wheels.sh download
# Set the S3 path for the cached Docker image (for Job 2 to download)
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
# Mark that we used cache (for Docker image handling)
buildkite-agent meta-data set "rocm-used-cache" "true"
echo ""
echo "Cache download complete. Skipping Docker build."
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
else
echo ""
echo "CACHE MISS. Building from scratch..."
echo ""
# Build full base image (for later vLLM build)
DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--load \
.
# Build debs_wheel_release stage for wheel extraction
DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
--target debs_wheel_release \
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--load \
.
# Extract wheels from Docker image
mkdir -p artifacts/rocm-base-wheels
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
docker rm $${container_id}
echo "Extracted base wheels:"
ls -lh artifacts/rocm-base-wheels/
# Upload wheels to S3 cache for future builds
echo ""
echo "Uploading wheels to S3 cache..."
.buildkite/scripts/cache-rocm-base-wheels.sh upload
# Export base Docker image for reuse in vLLM build
mkdir -p artifacts/rocm-docker-image
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
echo "Docker image size:"
ls -lh artifacts/rocm-docker-image/
# Upload large Docker image to S3 (also cached by cache key)
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
# Save the S3 path for downstream jobs
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
# Mark that we did NOT use cache
buildkite-agent meta-data set "rocm-used-cache" "false"
echo ""
echo "Build complete. Wheels cached for future builds."
fi
artifact_paths:
- "artifacts/rocm-base-wheels/*.whl"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 2: Build vLLM ROCm Wheel
- label: ":python: Build vLLM ROCm Wheel"
id: build-rocm-vllm-wheel
depends_on:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 180
commands:
# Download artifacts and prepare Docker image
- |
set -euo pipefail
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
# This fixes version detection when tags are moved/force-pushed
echo "Fetching latest tags from origin..."
git fetch --tags --force origin
# Log tag information for debugging version detection
echo "========================================"
echo "Git Tag Verification"
echo "========================================"
echo "Current HEAD: $(git rev-parse HEAD)"
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
echo ""
echo "Recent tags (pointing to commits near HEAD):"
git tag -l --sort=-creatordate | head -5
echo "setuptools_scm version detection:"
pip install -q setuptools_scm 2>/dev/null || true
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
echo "========================================"
# Download wheel artifacts from current build
echo "Downloading wheel artifacts from current build"
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
# Download Docker image from S3 (too large for Buildkite artifacts)
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
echo "ERROR: rocm-docker-image-s3-path metadata not found"
echo "This should have been set by the build-rocm-base-wheels job"
exit 1
fi
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
mkdir -p artifacts/rocm-docker-image
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
# Load base Docker image and capture the tag
echo "Loading base Docker image..."
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
echo "$${LOAD_OUTPUT}"
# Extract the actual loaded image tag from "Loaded image: <tag>" output
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
if [ -z "$${BASE_IMAGE_TAG}" ]; then
echo "ERROR: Failed to extract image tag from docker load output"
echo "Load output was: $${LOAD_OUTPUT}"
exit 1
fi
echo "Loaded base image: $${BASE_IMAGE_TAG}"
# Prepare base wheels for Docker build context
mkdir -p docker/context/base-wheels
touch docker/context/base-wheels/.keep
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
echo "Base wheels for vLLM build:"
ls -lh docker/context/base-wheels/
# Get GPU architectures from meta-data
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
echo "========================================"
echo "Building vLLM wheel with:"
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
echo "========================================"
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
DOCKER_BUILDKIT=1 docker build \
--file docker/Dockerfile.rocm \
--target export_vllm_wheel_release \
--output type=local,dest=rocm-dist \
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg REMOTE_VLLM=0 \
--build-arg GIT_REPO_CHECK=1 \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
.
echo "Built vLLM wheel:"
ls -lh rocm-dist/*.whl
# Copy wheel to artifacts directory
mkdir -p artifacts/rocm-vllm-wheel
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
echo "Final vLLM wheel:"
ls -lh artifacts/rocm-vllm-wheel/
artifact_paths:
- "artifacts/rocm-vllm-wheel/*.whl"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 3: Upload Wheels to S3
- label: ":s3: Upload ROCm Wheels to S3"
id: upload-rocm-wheels
depends_on:
- step: build-rocm-vllm-wheel
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 60
commands:
# Download all wheel artifacts and run upload
- |
set -euo pipefail
# Check if upload is enabled (from env var, meta-data, or release branch)
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
# Try to get from meta-data (input form)
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
fi
echo "========================================"
echo "Upload check:"
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
echo "========================================"
# Skip upload if not enabled
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
exit 0
fi
echo "Upload enabled, proceeding..."
# Download artifacts from current build
echo "Downloading artifacts from current build"
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
# Run upload script
bash .buildkite/scripts/upload-rocm-wheels.sh
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 4: Annotate ROCm Wheel Release
- label: ":memo: Annotate ROCm wheel release"
id: annotate-rocm-release
depends_on:
- step: upload-rocm-wheels
allow_failure: true
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-rocm-release.sh"
env:
S3_BUCKET: "vllm-wheels"

View File

@@ -32,6 +32,7 @@ To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai:latest-rocm
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

View File

@@ -0,0 +1,74 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Generate Buildkite annotation for ROCm wheel release
set -ex
# Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## :rocm: ROCm Wheel Release
### Build Configuration
| Setting | Value |
|---------|-------|
| **ROCm Version** | ${ROCM_VERSION} |
| **Python Version** | ${PYTHON_VERSION} |
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
| **Branch** | \`${BUILDKITE_BRANCH}\` |
| **Commit** | \`${BUILDKITE_COMMIT}\` |
### :package: Installation
**Install from this build (by commit):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
# Example:
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\`
**Install from nightly (if published):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
\`\`\`
### :floppy_disk: Download Wheels Directly
\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
\`\`\`
### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton_rocm**: Triton built for ROCm
- **torchvision**: TorchVision for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
EOF

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Cache helper for ROCm base wheels
#
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
#
# Usage:
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
# cache-rocm-base-wheels.sh download - Download wheels from cache
# cache-rocm-base-wheels.sh key - Output the cache key
#
# Environment variables:
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
# PYTHON_VERSION - Python version (affects cache key)
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
#
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
# so changes to ROCm version are captured by the Dockerfile hash.
set -euo pipefail
BUCKET="${S3_BUCKET:-vllm-wheels}"
DOCKERFILE="docker/Dockerfile.rocm_base"
CACHE_PREFIX="rocm/cache"
# Generate hash from Dockerfile content + build args
generate_cache_key() {
# Include Dockerfile content
if [[ ! -f "$DOCKERFILE" ]]; then
echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
exit 1
fi
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
# Include key build args that affect the output
# These should match the ARGs in Dockerfile.rocm_base that change the build output
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
echo "${dockerfile_hash}-${args_hash}"
}
CACHE_KEY=$(generate_cache_key)
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
case "${1:-}" in
check)
echo "Checking cache for key: ${CACHE_KEY}" >&2
echo "Cache path: ${CACHE_PATH}" >&2
echo "Variables used in cache key:" >&2
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
# Check if cache exists by listing objects
# We look for at least one .whl file
echo "Running: aws s3 ls ${CACHE_PATH}" >&2
S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
echo "S3 ls output:" >&2
echo "$S3_OUTPUT" | head -5 >&2
if echo "$S3_OUTPUT" | grep -q "\.whl"; then
echo "hit"
else
echo "miss"
fi
;;
upload)
echo "========================================"
echo "Uploading wheels to cache"
echo "========================================"
echo "Cache key: ${CACHE_KEY}"
echo "Cache path: ${CACHE_PATH}"
echo ""
if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
exit 1
fi
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1
fi
echo "Uploading $WHEEL_COUNT wheels..."
aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
echo ""
echo "Cache upload complete!"
echo "========================================"
;;
download)
echo "========================================"
echo "Downloading wheels from cache"
echo "========================================"
echo "Cache key: ${CACHE_KEY}"
echo "Cache path: ${CACHE_PATH}"
echo ""
mkdir -p artifacts/rocm-base-wheels
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
echo ""
echo "Downloaded wheels:"
ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo ""
echo "Total: $WHEEL_COUNT wheels"
echo "========================================"
;;
key)
echo "${CACHE_KEY}"
;;
path)
echo "${CACHE_PATH}"
;;
*)
echo "Usage: $0 {check|upload|download|key|path}" >&2
echo "" >&2
echo "Commands:" >&2
echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
echo " key - Output the cache key" >&2
echo " path - Output the full S3 cache path" >&2
exit 1
;;
esac

View File

@@ -3,7 +3,14 @@
set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# This script uses DockerHub API to list and delete old tags with specified prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
# Get tag prefix from argument, default to "nightly-" if not provided
TAG_PREFIX="${1:-nightly-}"
echo "Cleaning up tags with prefix: $TAG_PREFIX"
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -45,7 +52,7 @@ get_all_tags() {
set -x
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
if [ -z "$tags" ]; then
break

View File

@@ -16,6 +16,18 @@ from urllib.parse import quote
import regex as re
def normalize_package_name(name: str) -> str:
"""
Normalize package name according to PEP 503.
https://peps.python.org/pep-0503/#normalized-names
Replace runs of underscores, hyphens, and periods with a single hyphen,
and lowercase the result.
"""
return re.sub(r"[-_.]+", "-", name).lower()
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
version = version.removesuffix("." + variant)
else:
if "+" in version:
version, variant = version.split("+")
version_part, suffix = version.split("+", 1)
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
# Git hashes and other suffixes are NOT variants
if suffix.startswith(("rocm", "cu", "cpu")):
variant = suffix
version = version_part
# Otherwise keep the full version string (variant stays None)
return WheelFileInfo(
package_name=package_name,
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
print("No wheel files found, skipping index generation.")
return
# For ROCm builds: inherit variant from vllm wheel
# All ROCm wheels should share the same variant as vllm
rocm_variant = None
for file in parsed_files:
if (
file.package_name == "vllm"
and file.variant
and file.variant.startswith("rocm")
):
rocm_variant = file.variant
print(f"Detected ROCm variant from vllm: {rocm_variant}")
break
# Apply ROCm variant to all wheels without a variant
if rocm_variant:
for file in parsed_files:
if file.variant is None:
file.variant = rocm_variant
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
# Group by variant
variant_to_files: dict[str, list[WheelFileInfo]] = {}
for file in parsed_files:
@@ -256,8 +294,8 @@ def generate_index_and_metadata(
variant_dir.mkdir(parents=True, exist_ok=True)
# gather all package names in this variant
packages = set(f.package_name for f in files)
# gather all package names in this variant (normalized per PEP 503)
packages = set(normalize_package_name(f.package_name) for f in files)
if variant == "default":
# these packages should also appear in the "project list"
# generate after all variants are processed
@@ -269,8 +307,10 @@ def generate_index_and_metadata(
f.write(project_list_str)
for package in packages:
# filter files belonging to this package only
package_files = [f for f in files if f.package_name == package]
# filter files belonging to this package only (compare normalized names)
package_files = [
f for f in files if normalize_package_name(f.package_name) == package
]
package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata(
@@ -341,8 +381,13 @@ if __name__ == "__main__":
args = parser.parse_args()
version = args.version
if "/" in version or "\\" in version:
raise ValueError("Version string must not contain slashes.")
# Allow rocm/ prefix, reject other slashes and all backslashes
if "\\" in version:
raise ValueError("Version string must not contain backslashes.")
if "/" in version and not version.startswith("rocm/"):
raise ValueError(
"Version string must not contain slashes (except for 'rocm/' prefix)."
)
current_objects_path = Path(args.current_objects)
output_dir = Path(args.output_dir)
if not output_dir.exists():
@@ -393,8 +438,23 @@ if __name__ == "__main__":
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{wheel_dir}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>
wheel_dir = args.wheel_dir or version
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
#
# For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
# - rocm/{commit}/ (same as wheels)
# - rocm/nightly/
# - rocm/{version}/
# All these are under the "rocm/" prefix, so relative paths should be
# relative to "rocm/", not the bucket root.
if args.wheel_dir:
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
wheel_dir = args.wheel_dir.strip().rstrip("/")
elif version.startswith("rocm/"):
# For rocm/commit, wheel_base_dir should be just the commit part
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
wheel_dir = version.split("/", 1)[1]
else:
wheel_dir = version
wheel_base_dir = Path(output_dir).parent / wheel_dir
index_base_dir = Path(output_dir)
generate_index_and_metadata(

View File

@@ -0,0 +1,36 @@
#!/bin/bash
set -ex
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
# otherwise they will be cleaned up together with the main "nightly" tags.
TAG_VARIANT="$1"
if [ -n "$TAG_VARIANT" ]; then
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
TAG_NAME="$TAG_VARIANT-nightly"
else
ORIG_TAG_SUFFIX=""
TAG_NAME="nightly"
fi
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl

View File

@@ -0,0 +1,151 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Upload ROCm wheels to S3 with proper index generation
#
# Required environment variables:
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
# S3_BUCKET (default: vllm-wheels)
#
# S3 path structure:
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
set -ex
# ======== Configuration ========
BUCKET="${S3_BUCKET:-vllm-wheels}"
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
INDICES_OUTPUT_DIR="rocm-indices"
PYTHON="${PYTHON_PROG:-python3}"
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
MANYLINUX_VERSION="manylinux_2_35"
echo "========================================"
echo "ROCm Wheel Upload Configuration"
echo "========================================"
echo "S3 Bucket: $BUCKET"
echo "S3 Path: $ROCM_SUBPATH"
echo "Commit: $BUILDKITE_COMMIT"
echo "Branch: $BUILDKITE_BRANCH"
echo "========================================"
# ======== Part 0: Setup Python ========
# Detect if python3.12+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
if [[ "$has_new_python" -eq 0 ]]; then
# Use new python from docker
# Use --user to ensure files are created with correct ownership (not root)
docker pull python:3-slim
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ======== Part 1: Collect and prepare wheels ========
# Collect all wheels
mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
echo "ERROR: No wheels found to upload!"
exit 1
fi
# Rename linux to manylinux in wheel filenames
for wheel in all-rocm-wheels/*.whl; do
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
mv -- "$wheel" "$new_wheel"
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
fi
done
echo ""
echo "Wheels to upload:"
ls -lh all-rocm-wheels/
# ======== Part 2: Upload wheels to S3 ========
echo ""
echo "Uploading wheels to $S3_COMMIT_PREFIX"
for wheel in all-rocm-wheels/*.whl; do
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
done
# ======== Part 3: Generate and upload indices ========
# List existing wheels in commit directory
echo ""
echo "Generating indices..."
obj_json="rocm-objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# Use the existing generate-nightly-index.py
# HACK: Replace regex module with stdlib re (same as CUDA script)
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py \
--version "$ROCM_SUBPATH" \
--current-objects "$obj_json" \
--output-dir "$INDICES_OUTPUT_DIR" \
--comment "ROCm commit $BUILDKITE_COMMIT"
# Upload indices to commit directory
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
# Update rocm/nightly/ if on main branch and not a PR
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
echo "Updating rocm/nightly/ index..."
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"
PURE_VERSION="${VERSION%%+*}"
PURE_VERSION="${PURE_VERSION%%.rocm}"
echo "Pure version: $PURE_VERSION"
if [[ "$VERSION" != *"dev"* ]]; then
echo "Updating rocm/$PURE_VERSION/ index..."
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
fi
fi
# ======== Part 4: Summary ========
echo ""
echo "========================================"
echo "ROCm Wheel Upload Complete!"
echo "========================================"
echo ""
echo "Wheels available at:"
echo " s3://$BUCKET/$ROCM_SUBPATH/"
echo ""
echo "Install command (by commit):"
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
echo ""
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
echo "Install command (nightly):"
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
fi
echo ""
echo "Wheel count: $WHEEL_COUNT"
echo "========================================"

View File

@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
# Sccache configuration (only used in release pipeline)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
FROM ${BASE_IMAGE} AS base
ARG ARG_PYTORCH_ROCM_ARCH
@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
# Remove sccache
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
ARG USE_SCCACHE
RUN if [ "$USE_SCCACHE" != "1" ]; then \
apt-get purge -y sccache || true; \
python3 -m pip uninstall -y sccache || true; \
rm -f "$(which sccache)" || true; \
fi
# Install UV
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
if command -v sccache >/dev/null 2>&1; then \
echo "sccache already installed, skipping installation"; \
sccache --version; \
else \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
fi; \
fi
# Set sccache environment variables only when USE_SCCACHE=1
# This prevents S3 config from leaking into images when sccache is not used
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}
@@ -39,6 +85,8 @@ ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main"
ENV VLLM_REPO=${VLLM_REPO}
ENV VLLM_BRANCH=${VLLM_BRANCH}
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
@@ -51,7 +99,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------
# vLLM build stages
FROM fetch_vllm AS build_vllm
# Build vLLM
# Build vLLM (setup.py auto-detects sccache in PATH)
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
@@ -67,6 +115,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# RIXL/UCX build stages
FROM base AS build_rixl
ARG RIXL_BRANCH="f33a5599"
ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
ARG UCX_BRANCH="da3fac2a"
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
ENV ROCM_PATH=/opt/rocm
ENV UCX_HOME=/usr/local/ucx
ENV RIXL_HOME=/usr/local/rixl
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
# RIXL build system dependences and RDMA support
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
libcpprest-dev \
libaio-dev \
librdmacm1 \
librdmacm-dev \
libibverbs1 \
libibverbs-dev \
ibverbs-utils \
rdmacm-utils \
ibverbs-providers \
&& rm -rf /var/lib/apt/lists/*
RUN uv pip install --system meson auditwheel patchelf tomlkit
RUN cd /usr/local/src && \
git clone ${UCX_REPO} && \
cd ucx && \
git checkout ${UCX_BRANCH} && \
./autogen.sh && \
mkdir build && cd build && \
../configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-devel-headers \
--with-rocm=/opt/rocm \
--with-verbs \
--with-dm \
--enable-mt && \
make -j && \
make install
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
RUN git clone ${RIXL_REPO} /opt/rixl && \
cd /opt/rixl && \
git checkout ${RIXL_BRANCH} && \
meson setup build --prefix=${RIXL_HOME} \
-Ducx_path=${UCX_HOME} \
-Drocm_path=${ROCM_PATH} && \
cd build && \
ninja && \
ninja install
# Generate RIXL wheel
RUN cd /opt/rixl && mkdir -p /app/install && \
./contrib/build-wheel.sh \
--output-dir /app/install \
--rocm-dir ${ROCM_PATH} \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
# -----------------------
# vLLM wheel release build stage (for building distributable wheels)
# This stage pins dependencies to custom ROCm wheel versions and handles version detection
FROM fetch_vllm AS build_vllm_wheel_release
ARG COMMON_WORKDIR
# Create /install directory for custom wheels
RUN mkdir -p /install
# Copy custom ROCm wheels from docker/context if they exist
# COPY ensures Docker cache is invalidated when wheels change
# .keep file ensures directory always exists for COPY to work
COPY docker/context/base-wheels/ /tmp/base-wheels/
# This is how we know if we are building for a wheel release or not.
# If there are not wheels found there, we are not building for a wheel release.
# So we exit with an error. To skip this stage.
RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
echo "Found custom wheels - copying to /install"; \
cp /tmp/base-wheels/*.whl /install/ && \
echo "Copied custom wheels:"; \
ls -lh /install/; \
else \
echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
echo "Wheel releases require pre-built ROCm wheels."; \
exit 1; \
fi
# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
ARG GIT_REPO_CHECK=0
RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
echo "Running repository checks..."; \
cd vllm && bash tools/check_repo.sh; \
fi
# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
# This ensures setuptools_scm sees clean repo state for version detection
RUN --mount=type=bind,source=.git,target=vllm/.git \
cd vllm \
&& pip install setuptools_scm \
&& VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
&& echo "Detected vLLM version: ${VLLM_VERSION}" \
&& echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
# Fail if git-based package dependencies are found in requirements files
# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
RUN echo "Checking for git-based packages in requirements files..." \
&& echo "Checking common.txt for git-based packages:" \
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
echo "ERROR: Git-based packages found in common.txt:"; \
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
echo "Please publish these packages to PyPI instead of using git dependencies."; \
exit 1; \
else \
echo " ✓ No git-based packages found in common.txt"; \
fi \
&& echo "Checking rocm.txt for git-based packages:" \
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
echo "ERROR: Git-based packages found in rocm.txt:"; \
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
echo "Please publish these packages to PyPI instead of using git dependencies."; \
exit 1; \
else \
echo " ✓ No git-based packages found in rocm.txt"; \
fi \
&& echo "All requirements files are clean - no git-based packages found"
# Pin vLLM dependencies to exact versions of custom ROCm wheels
# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
&& python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
# Install dependencies using custom wheels from /install
RUN cd vllm \
&& echo "Building vLLM with custom wheels from /install" \
&& python3 -m pip install --find-links /install -r requirements/rocm.txt \
&& python3 setup.py clean --all
# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
# (setup.py auto-detects sccache in PATH)
RUN --mount=type=bind,source=.git,target=vllm/.git \
cd vllm \
&& export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
&& echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm_wheel_release
ARG COMMON_WORKDIR
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# -----------------------
# Test vLLM image
FROM base AS test
@@ -83,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& pip uninstall -y vllm \
&& uv pip install --system *.whl
# Install RIXL wheel
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
uv pip install --system /rixl_install/*.whl
WORKDIR /vllm-workspace
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
@@ -159,3 +383,7 @@ ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
CMD ["/bin/bash"]
#Set entrypoint for vllm-openai official images
FROM final As vllm-openai
ENTRYPOINT ["vllm", "serve"]

View File

@@ -14,16 +14,13 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git"
ARG MORI_BRANCH="2d02c6a9"
ARG MORI_REPO="https://github.com/ROCm/mori.git"
#TODO: When patch has been upstreamed, switch to the main repo/branch
# ARG RIXL_BRANCH="<TODO>"
# ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
ARG RIXL_BRANCH="50d63d94"
ARG RIXL_REPO="https://github.com/vcave/RIXL.git"
# Needed by RIXL
ARG ETCD_BRANCH="7c6e714f"
ARG ETCD_REPO="https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git"
ARG UCX_BRANCH="da3fac2a"
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
# Sccache configuration (only used in release pipeline)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
FROM ${BASE_IMAGE} AS base
@@ -64,6 +61,49 @@ RUN apt-get update -y \
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
fi
# Setup sccache for HIP compilation via HIP_CLANG_PATH
# This creates wrapper scripts in a separate directory and points HIP to use them
# This avoids modifying the original ROCm binaries which can break detection
# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Setting up sccache wrappers for HIP compilation..." \
&& mkdir -p /opt/sccache-wrappers \
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
&& chmod +x /opt/sccache-wrappers/clang++ \
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
&& chmod +x /opt/sccache-wrappers/clang \
&& echo "sccache wrappers created in /opt/sccache-wrappers"; \
fi
# Set sccache environment variables only when USE_SCCACHE=1
# This prevents S3 config from leaking into images when sccache is not used
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
###
### Triton Build
@@ -100,22 +140,42 @@ ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO
ARG USE_SCCACHE
RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
&& pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
&& sccache --show-stats; \
fi \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
fi \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
fi \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
@@ -138,105 +198,25 @@ RUN cd mori \
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
###
### RIXL Build
###
FROM build_pytorch AS build_rixl
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO
ENV ROCM_PATH=/opt/rocm
ENV UCX_HOME=/usr/local/ucx
ENV RIXL_HOME=/usr/local/rixl
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
# RIXL build system dependences and RDMA support
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
libcpprest-dev \
libaio-dev \
librdmacm1 \
librdmacm-dev \
libibverbs1 \
libibverbs-dev \
ibverbs-utils \
rdmacm-utils \
ibverbs-providers
RUN pip install meson auditwheel patchelf tomlkit
WORKDIR /workspace
RUN git clone ${ETCD_REPO} && \
cd etcd-cpp-apiv3 && \
git checkout ${ETCD_BRANCH} && \
mkdir build && cd build && \
cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
make -j$(nproc) && \
make install
RUN cd /usr/local/src && \
git clone ${UCX_REPO} && \
cd ucx && \
git checkout ${UCX_BRANCH} && \
./autogen.sh && \
mkdir build && cd build && \
../configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-devel-headers \
--with-rocm=/opt/rocm \
--with-verbs \
--with-dm \
--enable-mt && \
make -j && \
make -j install
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
RUN git clone ${RIXL_REPO} /opt/rixl && \
cd /opt/rixl && \
git checkout ${RIXL_BRANCH} && \
meson setup build --prefix=${RIXL_HOME} \
-Ducx_path=${UCX_HOME} \
-Drocm_path=${ROCM_PATH} && \
cd build && \
ninja && \
ninja install
# Generate RIXL wheel
RUN cd /opt/rixl && mkdir -p /app/install && \
./contrib/build-wheel.sh \
--output-dir /app/install \
--rocm-dir ${ROCM_PATH} \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
###
### FlashAttention Build
###
FROM base AS build_fa
ARG FA_BRANCH
ARG FA_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone ${FA_REPO}
RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
@@ -246,6 +226,7 @@ RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
@@ -253,13 +234,37 @@ RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN pip install pyyaml && cd aiter \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
###
### Final Build
###
# Wheel release stage -
# only includes dependencies used by wheel release pipeline
FROM base AS debs_wheel_release
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
# Full debs stage - includes Mori (used by Docker releases)
FROM base AS debs
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
@@ -274,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
FROM base AS final
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
@@ -294,12 +297,6 @@ ARG FA_BRANCH
ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
ARG RIXL_BRANCH
ARG RIXL_REPO
ARG ETCD_BRANCH
ARG ETCD_REPO
ARG UCX_BRANCH
ARG UCX_REPO
ARG MORI_BRANCH
ARG MORI_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
@@ -315,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
&& echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
&& echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
&& echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
&& echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
&& echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
&& echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt

View File

@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
```json
{
"pooling_type": "auto",
"normalize": true,
"use_activation": true,
"enable_chunked_processing": true,
"max_embed_len": 3072000
}

View File

@@ -14,7 +14,7 @@ Prerequisites:
# MEAN pooling (processes all chunks, recommended for complete coverage)
vllm serve intfloat/multilingual-e5-large \
--pooler-config \
'{"pooling_type": "MEAN", "normalize": true, ' \
'{"pooling_type": "MEAN", "use_activation": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \
--served-model-name multilingual-e5-large \
--trust-remote-code \
@@ -24,7 +24,7 @@ Prerequisites:
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
vllm serve BAAI/bge-large-en-v1.5 \
--pooler-config \
'{"pooling_type": "CLS", "normalize": true, ' \
'{"pooling_type": "CLS", "use_activation": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \
--served-model-name bge-large-en-v1.5 \
--trust-remote-code \

View File

@@ -96,7 +96,7 @@ echo ""
echo "🔧 Starting server with enhanced chunked processing configuration..."
# Build pooler config JSON
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
# Start vLLM server with enhanced chunked processing
vllm serve "$MODEL_NAME" \

View File

@@ -32,7 +32,7 @@ pyzmq >= 25.0.0
msgspec
gguf >= 0.17.0
mistral_common[image] >= 1.8.8
opencv-python-headless >= 4.11.0 # required for video IO
opencv-python-headless >= 4.13.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12

View File

@@ -25,7 +25,7 @@ transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.9.2 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
@@ -37,8 +37,8 @@ bitsandbytes>=0.46.1
buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
genai_perf>=0.0.8
tritonclient>=2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy

View File

@@ -80,6 +80,8 @@ num2words==0.5.14
pqdm==0.2.0
# via lm-eval
# Required for fastsafetensors test
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
# Required for suffix decoding test
arctic-inference == 0.1.1
# Required for Nemotron test

View File

@@ -15,5 +15,4 @@ setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3
conch-triton-kernels==1.2.1
timm>=1.0.17
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
grpcio-tools>=1.76.0

View File

@@ -33,7 +33,7 @@ matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.11.0 # required for video test
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.9.2 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
@@ -45,8 +45,8 @@ bitsandbytes==0.46.1
buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
genai_perf>=0.0.8
tritonclient>=2.51.0
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding

View File

@@ -31,7 +31,11 @@ albumentations==1.4.6
# -r requirements/test.in
# terratorch
alembic==1.16.4
# via mlflow
# via
# mlflow
# optuna
annotated-doc==0.0.4
# via fastapi
annotated-types==0.7.0
# via pydantic
antlr4-python3-runtime==4.9.3
@@ -143,6 +147,8 @@ colorama==0.4.6
# tqdm-multiprocess
colorful==0.5.6
# via ray
colorlog==6.10.1
# via optuna
contourpy==1.3.0
# via matplotlib
coverage==7.10.6
@@ -250,7 +256,7 @@ fsspec==2024.9.0
# torch
ftfy==6.3.1
# via open-clip-torch
genai-perf==0.0.8
genai-perf==0.0.16
# via -r requirements/test.in
genson==1.3.0
# via datamodel-code-generator
@@ -387,6 +393,7 @@ jinja2==3.1.6
# via
# datamodel-code-generator
# flask
# genai-perf
# mlflow
# torch
jiwer==3.0.5
@@ -526,7 +533,7 @@ numba==0.61.2
# librosa
numexpr==2.10.1
# via lm-eval
numpy==1.26.4
numpy==2.2.6
# via
# -r requirements/test.in
# accelerate
@@ -556,6 +563,7 @@ numpy==1.26.4
# numba
# numexpr
# opencv-python-headless
# optuna
# pandas
# patsy
# peft
@@ -635,7 +643,7 @@ opencensus==0.11.4
# via ray
opencensus-context==0.1.3
# via opencensus
opencv-python-headless==4.11.0.86
opencv-python-headless==4.13.0.90
# via
# -r requirements/test.in
# albucore
@@ -658,6 +666,10 @@ opentelemetry-sdk==1.35.0
# ray
opentelemetry-semantic-conventions==0.56b0
# via opentelemetry-sdk
optuna==3.6.1
# via genai-perf
orjson==3.11.5
# via genai-perf
packaging==24.2
# via
# accelerate
@@ -676,6 +688,7 @@ packaging==24.2
# lightning-utilities
# matplotlib
# mlflow-skinny
# optuna
# peft
# plotly
# pooch
@@ -715,6 +728,8 @@ peft==0.16.0
# lm-eval
perceptron==0.1.4
# via -r requirements/test.in
perf-analyzer==0.1.0
# via genai-perf
pillow==10.4.0
# via
# genai-perf
@@ -901,6 +916,7 @@ pyyaml==6.0.2
# lightning
# mlflow-skinny
# omegaconf
# optuna
# peft
# pytorch-lightning
# ray
@@ -1063,6 +1079,7 @@ sortedcontainers==2.4.0
soundfile==0.12.1
# via
# -r requirements/test.in
# genai-perf
# librosa
# mistral-common
soxr==0.5.0.post1
@@ -1073,6 +1090,7 @@ sqlalchemy==2.0.41
# via
# alembic
# mlflow
# optuna
sqlitedict==2.1.0
# via lm-eval
sqlparse==0.5.3
@@ -1202,6 +1220,7 @@ tqdm==4.66.6
# mteb
# nltk
# open-clip-torch
# optuna
# peft
# pqdm
# pretrainedmodels
@@ -1224,10 +1243,8 @@ transformers-stream-generator==0.0.5
# via -r requirements/test.in
triton==3.5.1
# via torch
tritonclient==2.51.0
# via
# -r requirements/test.in
# genai-perf
tritonclient==2.64.0
# via -r requirements/test.in
typepy==1.3.2
# via
# dataproperty

View File

@@ -267,12 +267,16 @@ async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) == 1
# max_completion_tokens > max_model_len
# max_model_len=32768 for Gemma-3n-E2B-it
transcription = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": int(1e6)},
extra_body={
"max_completion_tokens": int(1e6),
"repetition_penalty": 1.3,
},
)
out = json.loads(transcription)
out_text = out["text"]

View File

@@ -53,7 +53,9 @@ def test_token_embed(llm: LLM):
def test_pooling_params(llm: LLM):
def get_outputs(normalize):
outputs = llm.embed(
prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False
prompts,
pooling_params=PoolingParams(use_activation=normalize),
use_tqdm=False,
)
return torch.tensor([x.outputs.embedding for x in outputs])

View File

@@ -216,7 +216,7 @@ def server_with_chunked_processing():
"512", # Set smaller max_model_len to trigger chunking mechanism
"--pooler-config",
(
'{"pooling_type": "MEAN", "normalize": true, '
'{"pooling_type": "MEAN", "use_activation": true, '
'"enable_chunked_processing": true, "max_embed_len": 10000}'
),
"--gpu-memory-utilization",

View File

@@ -236,17 +236,14 @@ class TestModel:
"use_activation": use_activation,
},
)
if response.status_code != 200:
return response
outputs = response.json()
return torch.tensor([x["score"] for x in outputs["data"]])
if model["is_cross_encoder"]:
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
if model["is_cross_encoder"]:
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
@@ -256,9 +253,3 @@ class TestModel:
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
else:
get_outputs(use_activation=None)
# The activation parameter only works for the is_cross_encoder model
response = get_outputs(use_activation=True)
assert response.status_code == 400

View File

@@ -48,7 +48,7 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
# asserts on the pooling config files
assert model_config.pooler_config.seq_pooling_type == "CLS"
assert model_config.pooler_config.tok_pooling_type == "ALL"
assert model_config.pooler_config.normalize
assert model_config.pooler_config.use_activation
# asserts on the tokenizer loaded
assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
@@ -93,7 +93,7 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
# asserts on the pooling config files
assert model_config.pooler_config.seq_pooling_type == "MEAN"
assert model_config.pooler_config.tok_pooling_type == "ALL"
assert model_config.pooler_config.normalize
assert model_config.pooler_config.use_activation
# asserts on the tokenizer loaded
assert model_config.tokenizer == "intfloat/multilingual-e5-base"

View File

@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
pooler_config=PoolerConfig(use_activation=False),
) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
pooler_config=PoolerConfig(use_activation=True),
) as vllm_model:
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
pooler_config=PoolerConfig(use_activation=False),
) as vllm_model:
wo_normalize = vllm_model.token_embed(example_prompts)
@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
pooler_config=PoolerConfig(use_activation=True),
) as vllm_model:
w_normalize = vllm_model.token_embed(example_prompts)

View File

@@ -176,3 +176,46 @@ def test_models_distributed(
distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
def test_encoder_cache_cleanup(
vllm_runner,
model: str,
input_audios,
monkeypatch,
) -> None:
"""Test that encoder cache is properly cleaned up after requests complete.
This is a regression test for a bug where encoder cache entries were freed
in the same scheduling step they were allocated, before the model could use
them.
"""
# Set single-process mode to access the model runner's encoder cache directly
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
check_model_available(model)
with vllm_runner(
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=1,
limit_mm_per_prompt={"audio": 2},
enforce_eager=True,
) as vllm_model:
engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
model_runner = engine_core.model_executor.driver_worker.worker.model_runner
encoder_cache = model_runner.encoder_cache
# Run multiple sequential requests to ensure cache is properly managed
for vllm_prompts, _, audios in input_audios:
vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
# After all requests complete, encoder cache should be empty
cache_size = len(encoder_cache)
assert cache_size == 0, (
f"Encoder cache should be empty after all requests complete, "
f"but has {cache_size} entries. This indicates encoder cache "
f"entries are not being properly freed."
)

View File

@@ -160,7 +160,7 @@ def test_get_pooling_config():
model_config = ModelConfig(model_id)
assert model_config.pooler_config is not None
assert model_config.pooler_config.normalize
assert model_config.pooler_config.use_activation
assert model_config.pooler_config.seq_pooling_type == "MEAN"
assert model_config.pooler_config.tok_pooling_type == "ALL"

View File

@@ -18,7 +18,7 @@ EMBEDDING_MODELS = [
]
classify_parameters = ["use_activation"]
embed_parameters = ["dimensions", "normalize"]
embed_parameters = ["dimensions", "use_activation"]
step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
@@ -42,17 +42,17 @@ def test_embed():
task = "embed"
model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
pooling_params = PoolingParams(normalize=None)
pooling_params = PoolingParams(use_activation=None)
pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=True)
pooling_params = PoolingParams(use_activation=True)
pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=False)
pooling_params = PoolingParams(use_activation=False)
pooling_params.verify(task=task, model_config=model_config)
invalid_parameters = classify_parameters + step_pooling_parameters
for p in invalid_parameters:
for p in set(invalid_parameters) - set(embed_parameters):
with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config)
@@ -98,7 +98,7 @@ def test_classify(task):
pooling_params.verify(task=task, model_config=model_config)
invalid_parameters = embed_parameters + step_pooling_parameters
for p in invalid_parameters:
for p in set(invalid_parameters) - set(classify_parameters):
with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config)
@@ -111,20 +111,20 @@ def test_token_embed(pooling_type: str):
pooler_config=PoolerConfig(tok_pooling_type=pooling_type)
)
pooling_params = PoolingParams(normalize=None)
pooling_params = PoolingParams(use_activation=None)
pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=True)
pooling_params = PoolingParams(use_activation=True)
pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=False)
pooling_params = PoolingParams(use_activation=False)
pooling_params.verify(task=task, model_config=model_config)
invalid_parameters = classify_parameters
if pooling_type != "STEP":
invalid_parameters = classify_parameters + step_pooling_parameters
for p in invalid_parameters:
for p in set(invalid_parameters) - set(embed_parameters):
with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config)
@@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str):
if pooling_type != "STEP":
invalid_parameters = embed_parameters + step_pooling_parameters
for p in invalid_parameters:
for p in set(invalid_parameters) - set(classify_parameters):
with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config)

View File

@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
(None, None, 1, 1, None, None),
# When kv_offloading_size is None, offloading is disabled (backend is ignored)
("native", None, 1, 1, None, None),
],
)
def test_kv_connector(
@@ -62,3 +63,19 @@ def test_kv_connector(
assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
# Existing config should be replaced
assert "existing_key" not in kv_connector_extra_config
def test_kv_offloading_size_only_uses_native_default():
"""Test that setting only kv_offloading_size enables native offloading."""
vllm_config = VllmConfig(
cache_config=CacheConfig(
kv_offloading_size=4.0,
# kv_offloading_backend not set, should default to "native"
),
)
kv_transfer_config = vllm_config.kv_transfer_config
kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config
assert kv_transfer_config.kv_connector == "OffloadingConnector"
assert kv_transfer_config.kv_role == "kv_both"
assert kv_connector_extra_config["cpu_bytes_to_use"] == 4.0 * (1 << 30)

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pin vLLM dependencies to exact versions of custom ROCm wheels.
This script modifies vLLM's requirements files to replace version constraints
with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).
This ensures that 'pip install vllm' automatically installs the correct custom wheels
instead of allowing pip to download different versions from PyPI.
"""
import re
import sys
from pathlib import Path
def extract_version_from_wheel(wheel_name: str) -> str:
"""
Extract version from wheel filename.
Example:
torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0
"""
# Wheel format:
# {distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl
parts = wheel_name.replace(".whl", "").split("-")
if len(parts) < 5:
raise ValueError(f"Invalid wheel filename format: {wheel_name}")
# Version is the second part
version = parts[1]
return version
def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
"""
Read /install directory and extract versions of custom wheels.
Returns:
Dict mapping package names to exact versions
"""
install_path = Path(install_dir)
if not install_path.exists():
print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
sys.exit(1)
versions = {}
# Map wheel prefixes to package names
# IMPORTANT: Use dashes to avoid matching substrings
# (e.g., 'torch' would match 'torchvision')
# ORDER MATTERS: This order is preserved when pinning dependencies
# in requirements files
package_mapping = [
("torch-", "torch"), # Match torch- (not torchvision)
("triton-", "triton"), # Match triton- (not triton_kernels)
("triton_kernels-", "triton-kernels"), # Match triton_kernels-
("torchvision-", "torchvision"), # Match torchvision-
("torchaudio-", "torchaudio"), # Match torchaudio-
("amdsmi-", "amdsmi"), # Match amdsmi-
("flash_attn-", "flash-attn"), # Match flash_attn-
("aiter-", "aiter"), # Match aiter-
]
for wheel_file in install_path.glob("*.whl"):
wheel_name = wheel_file.name
for prefix, package_name in package_mapping:
if wheel_name.startswith(prefix):
try:
version = extract_version_from_wheel(wheel_name)
versions[package_name] = version
print(f"Found {package_name}=={version}", file=sys.stderr)
except Exception as e:
print(
f"WARNING: Could not extract version from {wheel_name}: {e}",
file=sys.stderr,
)
break
# Return versions in the order defined by package_mapping
ordered_versions = {}
for _, package_name in package_mapping:
if package_name in versions:
ordered_versions[package_name] = versions[package_name]
return ordered_versions
def pin_dependencies_in_requirements(requirements_path: str, versions: dict[str, str]):
"""
Insert custom wheel pins at the TOP of requirements file.
This ensures that when setup.py processes the file line-by-line,
custom wheels (torch, triton, etc.) are encountered FIRST, before
any `-r common.txt` includes that might pull in other dependencies.
Creates:
# Custom ROCm wheel pins (auto-generated)
torch==2.9.0a0+git1c57644
triton==3.4.0
torchvision==0.23.0a0+824e8c8
amdsmi==26.1.0+5df6c765
-r common.txt
... rest of file ...
"""
requirements_file = Path(requirements_path)
if not requirements_file.exists():
print(
f"ERROR: Requirements file not found: {requirements_path}", file=sys.stderr
)
sys.exit(1)
# Backup original file
backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
with open(requirements_file) as f:
original_lines = f.readlines()
# Write backup
with open(backup_file, "w") as f:
f.writelines(original_lines)
# Build header with pinned custom wheels
header_lines = [
"# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
"# These must come FIRST to ensure correct dependency resolution\n",
]
for package_name, exact_version in versions.items():
header_lines.append(f"{package_name}=={exact_version}\n")
header_lines.append("\n") # Blank line separator
# Filter out any existing entries for custom packages from original file
filtered_lines = []
removed_packages = []
for line in original_lines:
stripped = line.strip()
should_keep = True
# Check if this line is for one of our custom packages
if stripped and not stripped.startswith("#") and not stripped.startswith("-"):
for package_name in versions:
# Handle both hyphen and underscore variations
pattern_name = package_name.replace("-", "[-_]")
pattern = rf"^{pattern_name}\s*[=<>]=?\s*[\d.a-zA-Z+]+"
if re.match(pattern, stripped, re.IGNORECASE):
removed_packages.append(f"{package_name}: {stripped}")
should_keep = False
break
if should_keep:
filtered_lines.append(line)
# Combine: header + filtered original content
final_lines = header_lines + filtered_lines
# Write modified content
with open(requirements_file, "w") as f:
f.writelines(final_lines)
# Print summary
print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
for package_name, exact_version in versions.items():
print(f" - {package_name}=={exact_version}", file=sys.stderr)
if removed_packages:
print("\n✓ Removed old package entries:", file=sys.stderr)
for pkg in removed_packages:
print(f" - {pkg}", file=sys.stderr)
print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
print(f" Backup saved: {backup_file}", file=sys.stderr)
def main():
if len(sys.argv) != 3:
print(
f"Usage: {sys.argv[0]} <install_dir> <requirements_file>", file=sys.stderr
)
print(
f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
file=sys.stderr,
)
sys.exit(1)
install_dir = sys.argv[1]
requirements_path = sys.argv[2]
print("=" * 70, file=sys.stderr)
print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
print("=" * 70, file=sys.stderr)
# Get versions from custom wheels
print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
versions = get_custom_wheel_versions(install_dir)
if not versions:
print("\nERROR: No custom wheels found in /install!", file=sys.stderr)
sys.exit(1)
# Pin dependencies in requirements file
print(f"\nPatching {requirements_path}...", file=sys.stderr)
pin_dependencies_in_requirements(requirements_path, versions)
print("\n" + "=" * 70, file=sys.stderr)
print("✓ Dependency pinning complete!", file=sys.stderr)
print("=" * 70, file=sys.stderr)
sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -152,13 +152,13 @@ class CacheConfig:
kv_offloading_size: float | None = None
"""Size of the KV cache offloading buffer in GiB. When TP > 1, this is
the total buffer size summed across all TP ranks. By default, this is set
to None, which means no KV offloading is enabled. When set with
kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
to None, which means no KV offloading is enabled. When set, vLLM will
enable KV cache offloading to CPU using the kv_offloading_backend."""
kv_offloading_backend: KVOffloadingBackend | None = None
kv_offloading_backend: KVOffloadingBackend = "native"
"""The backend to use for KV cache offloading. Supported backends include
'native' (vLLM native CPU offloading), 'lmcache' This option must be used
together with kv_offloading_size."""
'native' (vLLM native CPU offloading), 'lmcache'.
KV offloading is only activated when kv_offloading_size is set."""
def compute_hash(self) -> str:
"""

View File

@@ -48,7 +48,7 @@ class PoolerConfig:
## for embeddings models
normalize: bool | None = None
"""
Whether to normalize the embeddings outputs. Defaults to True.
DEPRECATED: please use `use_activation` instead.
"""
dimensions: int | None = None
"""
@@ -75,11 +75,11 @@ class PoolerConfig:
## for classification models
softmax: float | None = None
"""
softmax will be deprecated, please use use_activation instead.
DEPRECATED: please use `use_activation` instead.
"""
activation: float | None = None
"""
activation will be deprecated, please use use_activation instead.
DEPRECATED: please use `use_activation` instead.
"""
use_activation: bool | None = None
"""
@@ -164,17 +164,24 @@ class PoolerConfig:
def get_use_activation(o: object):
if softmax := getattr(o, "softmax", None) is not None:
if (normalize := getattr(o, "normalize", None)) is not None:
logger.warning_once(
"softmax will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
"`normalize` is deprecated and will be removed in v0.15. "
"Please use `use_activation` instead."
)
return normalize
if (softmax := getattr(o, "softmax", None)) is not None:
logger.warning_once(
"`softmax` is deprecated and will be removed in v0.15. "
"Please use `use_activation` instead."
)
return softmax
if activation := getattr(o, "activation", None) is not None:
if (activation := getattr(o, "activation", None)) is not None:
logger.warning_once(
"activation will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
"`activation` is deprecated and will be removed in v0.15. "
"Please use `use_activation` instead."
)
return activation

View File

@@ -498,17 +498,15 @@ class VllmConfig:
Right now, this function reads the offloading settings from
CacheConfig and configures the KVTransferConfig accordingly.
"""
if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None:
# KV offloading is only activated when kv_offloading_size is set.
if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
return
kv_offloading_backend = self.cache_config.kv_offloading_backend
# If no KVTransferConfig is provided, create a default one.
if self.kv_transfer_config is None:
self.kv_transfer_config = KVTransferConfig()
if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
raise ValueError(
"You must set kv_offloading_size when kv_offloading_backend is set."
)
num_kv_ranks = (
self.parallel_config.tensor_parallel_size
* self.parallel_config.pipeline_parallel_size

View File

@@ -3,10 +3,10 @@
from collections.abc import Mapping, MutableMapping
from pathlib import Path
from urllib.parse import urlparse
import aiohttp
import requests
from urllib3.util import parse_url
from vllm.version import __version__ as VLLM_VERSION
@@ -37,7 +37,7 @@ class HTTPConnection:
return self._async_client
def _validate_http_url(self, url: str):
parsed_url = urlparse(url)
parsed_url = parse_url(url)
if parsed_url.scheme not in ("http", "https"):
raise ValueError(

View File

@@ -234,7 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
lora_id=e.lora_id,
block_size=e.block_size,
medium=e.medium,
lora_name=e.lora_name,
lora_name=getattr(e, "lora_name", None),
)
for e in events
]

View File

@@ -578,9 +578,7 @@ class EngineArgs:
optimization_level: OptimizationLevel = VllmConfig.optimization_level
kv_offloading_size: float | None = CacheConfig.kv_offloading_size
kv_offloading_backend: KVOffloadingBackend | None = (
CacheConfig.kv_offloading_backend
)
kv_offloading_backend: KVOffloadingBackend = CacheConfig.kv_offloading_backend
tokens_only: bool = False
def __post_init__(self):

View File

@@ -540,14 +540,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
try:
generator = await handler.create_completion(request, raw_request)
except OverflowError as e:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
) from e
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(

View File

@@ -86,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
construct_input_messages,
)
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import (
PromptComponents,
@@ -760,11 +760,15 @@ class OpenAIServing:
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = exc.parameter
elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
# Common validation errors from user input
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = None
elif isinstance(exc, NotImplementedError):
err_type = "NotImplementedError"
status_code = HTTPStatus.NOT_IMPLEMENTED
param = None
elif exc.__class__.__name__ == "TemplateError":
# jinja2.TemplateError (avoid importing jinja2)
err_type = "BadRequestError"
@@ -783,9 +787,10 @@ class OpenAIServing:
traceback.print_exc()
else:
traceback.print_stack()
return ErrorResponse(
error=ErrorInfo(
message=message,
message=sanitize_message(message),
type=err_type,
code=status_code.value,
param=param,

View File

@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
ModelPermission,
UnloadLoRAAdapterRequest,
)
from vllm.entrypoints.utils import sanitize_message
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -300,5 +301,9 @@ def create_error_response(
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> ErrorResponse:
return ErrorResponse(
error=ErrorInfo(message=message, type=err_type, code=status_code.value)
error=ErrorInfo(
message=sanitize_message(message),
type=err_type,
code=status_code.value,
)
)

View File

@@ -1,8 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi import APIRouter, Depends, Request
from starlette.responses import JSONResponse
from typing_extensions import assert_never
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
try:
generator = await handler.create_classify(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never
@@ -47,9 +47,7 @@ async def create_embedding(
try:
generator = await handler.create_embedding(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(

View File

@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions,
normalize=self.normalize,
use_activation=self.normalize,
)
@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions,
normalize=self.normalize,
use_activation=self.normalize,
)

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
try:
generator = await handler.create_pooling(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code

View File

@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions,
normalize=self.normalize,
use_activation=get_use_activation(self),
)
@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions,
normalize=self.normalize,
use_activation=get_use_activation(self),
)

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse
from typing_extensions import assert_never
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
try:
generator = await handler.create_score(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
try:
generator = await handler.do_rerank(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code

View File

@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
try:
generator = await handler.serve_tokens(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code

View File

@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
try:
generator = await handler.create_tokenize(request, raw_request)
except NotImplementedError as e:
raise HTTPException(
status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
) from e
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(

View File

@@ -7,7 +7,7 @@ import functools
import os
from argparse import Namespace
from pathlib import Path
from typing import Any
from typing import TYPE_CHECKING, Any
import regex as re
from fastapi import Request
@@ -22,18 +22,25 @@ from vllm.entrypoints.chat_utils import (
resolve_hf_chat_template,
resolve_mistral_chat_template,
)
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
StreamOptions,
)
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser
if TYPE_CHECKING:
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
StreamOptions,
)
from vllm.entrypoints.openai.serving_models import LoRAModulePath
else:
ChatCompletionRequest = object
CompletionRequest = object
StreamOptions = object
LoRAModulePath = object
logger = init_logger(__name__)
VLLM_SUBCMD_PARSER_EPILOG = (
@@ -206,7 +213,7 @@ def _validate_truncation_size(
def get_max_tokens(
max_model_len: int,
request: ChatCompletionRequest | CompletionRequest,
request: "ChatCompletionRequest | CompletionRequest",
input_length: int,
default_sampling_params: dict,
) -> int:
@@ -227,6 +234,8 @@ def get_max_tokens(
def log_non_default_args(args: Namespace | EngineArgs):
from vllm.entrypoints.openai.cli_args import make_arg_parser
non_default_args = {}
# Handle Namespace
@@ -255,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
def should_include_usage(
stream_options: StreamOptions | None, enable_force_include_usage: bool
stream_options: "StreamOptions | None", enable_force_include_usage: bool
) -> tuple[bool, bool]:
if stream_options:
include_usage = stream_options.include_usage or enable_force_include_usage
@@ -270,6 +279,8 @@ def should_include_usage(
def process_lora_modules(
args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
) -> list[LoRAModulePath]:
from vllm.entrypoints.openai.serving_models import LoRAModulePath
lora_modules = args_lora_modules
if default_mm_loras:
default_mm_lora_paths = [

View File

@@ -442,9 +442,9 @@ def get_vllm_port() -> int | None:
try:
return int(port)
except ValueError as err:
from urllib.parse import urlparse
from urllib3.util import parse_url
parsed = urlparse(port)
parsed = parse_url(port)
if parsed.scheme:
raise ValueError(
f"VLLM_PORT '{port}' appears to be a URI. "

View File

@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead):
# for normalize
if self.activation is not None:
flags = [p.normalize for p in pooling_params]
flags = [p.use_activation for p in pooling_params]
if len(set(flags)) == 1:
if flags[0]:
pooled_data = self.activation(pooled_data)

View File

@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig):
vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config
head = EmbeddingPoolerHead(
projector=_load_st_projector(model_config),
head_dtype=model_config.head_dtype,
projector=_load_st_projector(model_config),
activation=PoolerNormalize(),
)
@@ -116,9 +116,9 @@ def pooler_for_classify(
vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config
head = ClassifierPoolerHead(
head_dtype=model_config.head_dtype,
classifier=classifier,
logit_bias=model_config.pooler_config.logit_bias,
head_dtype=model_config.head_dtype,
activation=resolve_classifier_act_fn(
model_config, static_num_labels=True, act_fn=act_fn
),

View File

@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC):
class TokenEmbeddingPoolerHead(TokenPoolerHead):
def __init__(
self,
projector: ProjectorFn | None = None,
head_dtype: torch.dtype | str | None = None,
projector: ProjectorFn | None = None,
activation: ActivationFn | None = None,
) -> None:
super().__init__()
self.projector = projector
self.head_dtype = head_dtype
self.projector = projector
self.activation = activation
def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
pooled_data = pooled_data[..., : pooling_param.dimensions]
# for normalize
if self.activation is not None and pooling_param.normalize:
if self.activation is not None and pooling_param.use_activation:
pooled_data = self.activation(pooled_data)
# pooled_data shape: [n_tokens, embedding_dimension]

View File

@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig):
vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config
head = TokenEmbeddingPoolerHead(
projector=_load_st_projector(model_config),
head_dtype=model_config.head_dtype,
projector=_load_st_projector(model_config),
activation=PoolerNormalize(),
)
@@ -116,9 +116,9 @@ def pooler_for_token_classify(
vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config
head = TokenClassifierPoolerHead(
head_dtype=model_config.head_dtype,
classifier=classifier,
logit_bias=model_config.pooler_config.logit_bias,
head_dtype=model_config.head_dtype,
activation=resolve_classifier_act_fn(
model_config, static_num_labels=False, act_fn=act_fn
),

View File

@@ -98,7 +98,9 @@ class QuantFP8(CustomOp):
num_token_padding=self.num_token_padding,
scale_ub=scale_ub,
use_per_token_if_dynamic=self.use_per_token_if_dynamic,
group_shape=self.group_shape if self.static else None,
group_shape=(self.group_shape.row, self.group_shape.col)
if self.static
else None,
)
def forward_hip(

View File

@@ -116,8 +116,8 @@ class BertPooler(SequencePooler):
# Use lambdas so that weights are not registered under `self.head`
self.head = EmbeddingPoolerHead(
projector=lambda x: self.dense(x),
head_dtype=head_dtype,
projector=lambda x: self.dense(x),
activation=LambdaPoolerActivation(self.act_fn),
)

View File

@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler):
config.hidden_size,
eps=config.norm_eps,
bias=config.norm_bias,
dtype=head_dtype,
)
# Use lambdas so that weights are not registered under `self.head`
self.head = EmbeddingPoolerHead(
projector=lambda x: self.dense(x),
head_dtype=head_dtype,
projector=lambda x: self.dense(x),
activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))),
)

View File

@@ -9,13 +9,13 @@ from concurrent.futures import ThreadPoolExecutor
from itertools import groupby
from pathlib import Path
from typing import TYPE_CHECKING, Any, TypeVar
from urllib.parse import ParseResult, urlparse
from urllib.request import url2pathname
import numpy as np
import numpy.typing as npt
import torch
from PIL import Image, UnidentifiedImageError
from urllib3.util import Url, parse_url
import vllm.envs as envs
from vllm.connections import HTTPConnection, global_http_connection
@@ -101,11 +101,14 @@ class MediaConnector:
def _load_data_url(
self,
url_spec: ParseResult,
url_spec: Url,
media_io: MediaIO[_M],
) -> _M: # type: ignore[type-var]
data_spec, data = url_spec.path.split(",", 1)
url_spec_path = url_spec.path or ""
data_spec, data = url_spec_path.split(",", 1)
media_type, data_type = data_spec.split(";", 1)
# media_type starts with a leading "/" (e.g., "/video/jpeg")
media_type = media_type.lstrip("/")
if data_type != "base64":
msg = "Only base64 data URLs are supported for now."
@@ -115,7 +118,7 @@ class MediaConnector:
def _load_file_url(
self,
url_spec: ParseResult,
url_spec: Url,
media_io: MediaIO[_M],
) -> _M: # type: ignore[type-var]
allowed_local_media_path = self.allowed_local_media_path
@@ -124,7 +127,9 @@ class MediaConnector:
"Cannot load local files without `--allowed-local-media-path`."
)
filepath = Path(url2pathname(url_spec.netloc + url_spec.path))
url_spec_path = url_spec.path or ""
url_spec_netloc = url_spec.netloc or ""
filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
if allowed_local_media_path not in filepath.resolve().parents:
raise ValueError(
f"The file path {filepath} must be a subpath "
@@ -133,7 +138,7 @@ class MediaConnector:
return media_io.load_file(filepath)
def _assert_url_in_allowed_media_domains(self, url_spec: ParseResult) -> None:
def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
if (
self.allowed_media_domains
and url_spec.hostname not in self.allowed_media_domains
@@ -151,9 +156,9 @@ class MediaConnector:
*,
fetch_timeout: int | None = None,
) -> _M: # type: ignore[type-var]
url_spec = urlparse(url)
url_spec = parse_url(url)
if url_spec.scheme.startswith("http"):
if url_spec.scheme and url_spec.scheme.startswith("http"):
self._assert_url_in_allowed_media_domains(url_spec)
connection = self.connection
@@ -181,10 +186,10 @@ class MediaConnector:
*,
fetch_timeout: int | None = None,
) -> _M:
url_spec = urlparse(url)
url_spec = parse_url(url)
loop = asyncio.get_running_loop()
if url_spec.scheme.startswith("http"):
if url_spec.scheme and url_spec.scheme.startswith("http"):
self._assert_url_in_allowed_media_domains(url_spec)
connection = self.connection

View File

@@ -26,9 +26,9 @@ class PoolingParams(
Set to None to disable truncation.
dimensions: Reduce the dimensions of embeddings
if model support matryoshka representation.
normalize: Whether to normalize the embeddings outputs.
softmax: softmax will be deprecated, please use use_activation instead.
activation: activation will be deprecated, please use use_activation instead.
normalize: Deprecated, please use use_activation instead.
softmax: Deprecated, please use use_activation instead.
activation: Deprecated, please use use_activation instead.
use_activation: Whether to apply activation function to
the classification outputs.
"""
@@ -63,15 +63,15 @@ class PoolingParams(
@property
def all_parameters(self) -> list[str]:
return ["dimensions", "normalize", "use_activation"]
return ["dimensions", "use_activation"]
@property
def valid_parameters(self):
return {
"embed": ["dimensions", "normalize"],
"embed": ["dimensions", "use_activation"],
"classify": ["use_activation"],
"score": ["use_activation"],
"token_embed": ["dimensions", "normalize"],
"token_embed": ["dimensions", "use_activation"],
"token_classify": ["use_activation"],
}
@@ -162,8 +162,8 @@ class PoolingParams(
def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
if self.task in ["embed", "token_embed"]:
if self.normalize is None:
self.normalize = True
if self.use_activation is None:
self.use_activation = True
if self.dimensions is not None and model_config is not None:
if not model_config.is_matryoshka:
@@ -213,7 +213,6 @@ class PoolingParams(
return (
f"PoolingParams("
f"task={self.task}, "
f"normalize={self.normalize}, "
f"dimensions={self.dimensions}, "
f"use_activation={self.use_activation}, "
f"step_tag_id={self.step_tag_id}, "

View File

@@ -801,7 +801,7 @@ def get_pooling_config(
logger.info("Found pooling configuration.")
config: dict[str, Any] = {"normalize": normalize}
config: dict[str, Any] = {"use_activation": normalize}
for key, val in pooling_dict.items():
if val is True:
pooling_type = parse_pooling_type(key)

View File

@@ -11,12 +11,12 @@ from collections.abc import (
Sequence,
)
from typing import Any
from urllib.parse import urlparse
from uuid import uuid4
import psutil
import zmq
import zmq.asyncio
from urllib3.util import parse_url
import vllm.envs as envs
from vllm.logger import init_logger
@@ -217,13 +217,15 @@ def find_process_using_port(port: int) -> psutil.Process | None:
def split_zmq_path(path: str) -> tuple[str, str, str]:
"""Split a zmq path into its parts."""
parsed = urlparse(path)
parsed = parse_url(path)
if not parsed.scheme:
raise ValueError(f"Invalid zmq path: {path}")
scheme = parsed.scheme
host = parsed.hostname or ""
port = str(parsed.port or "")
if host.startswith("[") and host.endswith("]"):
host = host[1:-1] # Remove brackets for IPv6 address
if scheme == "tcp" and not all((host, port)):
# The host and port fields are required for tcp

View File

@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
# ROCM paged attention kernel only supports block sizes 16 and 32
# due to shared memory (LDS) constraints on AMD GPUs.
# See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
return [16, 32]
# However, The limitations in [16, 32] are reasonable for a native C++ kernel,
# but vLLM should allow support for non-standard sizes via the Triton path,
# as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
# where the Triton kernel under rocm_atten does not support inference
# for a non-standard qwen3-next model with a block_size of 544.
# We have fixed the Triton kernel so that the standard model uses the original
# bit-addressing logic, while the non-standard model
# uses our optimized kernel logic.
return [16, 32, 544]
@classmethod
def get_supported_head_sizes(cls) -> list[int]:

View File

@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
def __init__(self, cache_size: int):
self.cache_size = cache_size
self.num_free_slots = cache_size
self.freed: list[str] = []
self.allocated: list[str] = []
self.to_free: list[str] = []
def check_and_update_cache(self, request: Request, input_id: int) -> bool:
return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
self.num_free_slots -= num_encoder_embeds
mm_hash = request.mm_features[input_id].identifier
self.freed.append(mm_hash)
self.allocated.append(mm_hash)
def free(self, request: Request) -> None:
for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
return set(range(len(request.mm_features)))
def get_freed_mm_hashes(self) -> list[str]:
freed = self.freed
self.freed = []
return freed
# As encoder cache is not used for enc-dec models, we can free the entries here
# The actual free happens in the runner, *before* the model is executed.
# Therefore, `freeable` acts as a buffer to free the entries only after the
# model is executed, mimicking the state transition of `EncoderCacheManager`.
to_free = self.to_free
self.to_free = self.allocated
self.allocated = []
return to_free
def free_encoder_input(self, request: Request, input_id: int) -> None:
num_encoder_embeds = request.get_num_encoder_embeds(input_id)

View File

@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module):
k: torch.Tensor | None,
p: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
# FIXME: Fix aiter_sampler's accuracy issue and remove this flag
DISABLE_AITER_SAMPLER = True
"""Optimized ROCm/aiter path (same structure as forward_cuda)."""
if (k is None and p is None) or generators:
if generators:
@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module):
"processed_logits",
"processed_logprobs",
), "aiter sampler does not support returning logits/logprobs."
if DISABLE_AITER_SAMPLER:
return self.forward_native(logits, generators, k, p)
return self.aiter_sample(logits, k, p, generators), None
def aiter_sample(