Compare commits
v0.18.0rc2...v0.18.1 (14 commits)
Commits:
a26e8dc7ff
599e7359a3
d0cf73ce42
f0a5c5973b
b7e4b88987
90b29e5302
a45d96ff42
7693c8eabf
7624525bf6
d1b4f10b19
9fdc0f3aeb
05d96d7991
ccbc5ac449
bcf2be9612
@@ -12,7 +12,7 @@ steps:
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
@@ -27,7 +27,7 @@ steps:
     depends_on: ~
     id: build-wheel-arm64-cuda-13-0
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
@@ -42,7 +42,7 @@ steps:
     depends_on: ~
     id: build-wheel-arm64-cpu
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
@@ -55,7 +55,7 @@ steps:
     depends_on: ~
     id: build-wheel-x86-cuda-12-9
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
@@ -68,7 +68,7 @@ steps:
     depends_on: ~
     id: build-wheel-x86-cuda-13-0
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
@@ -81,7 +81,7 @@ steps:
     depends_on: ~
     id: build-wheel-x86-cpu
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
@@ -97,7 +97,7 @@ steps:
     depends_on: ~
     id: build-release-image-x86
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
@@ -110,7 +110,7 @@ steps:
     depends_on: ~
     id: build-release-image-arm64
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
@@ -120,7 +120,7 @@ steps:
     depends_on: ~
     id: build-release-image-x86-cuda-13-0
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
@@ -133,13 +133,57 @@ steps:
     depends_on: ~
     id: build-release-image-arm64-cuda-13-0
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"

+  - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-x86-ubuntu2404
+    agents:
+      queue: cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+
+  - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-arm64-ubuntu2404
+    agents:
+      queue: arm64_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
+
+  - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-x86-cuda-13-0-ubuntu2404
+    agents:
+      queue: cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+
+  - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-arm64-cuda-13-0-ubuntu2404
+    agents:
+      queue: arm64_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
+
   - block: "Build release image for x86_64 CPU"
     key: block-cpu-release-image-build
     depends_on: ~
@@ -148,8 +192,9 @@ steps:
     depends_on:
       - block-cpu-release-image-build
+      - input-release-version
     id: build-release-image-x86-cpu
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
@@ -163,11 +208,12 @@ steps:
     depends_on: ~

   - label: "Build release image - arm64 - CPU"
     depends_on:
       - block-arm64-cpu-release-image-build
+      - input-release-version
     id: build-release-image-arm64-cpu
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
@@ -185,7 +231,7 @@ steps:
       - build-release-image-arm64
     id: create-multi-arch-manifest
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
@@ -196,7 +242,7 @@ steps:
       - create-multi-arch-manifest
     id: annotate-release-workflow
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"

@@ -206,18 +252,67 @@ steps:
       - build-release-image-arm64-cuda-13-0
     id: create-multi-arch-manifest-cuda-13-0
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
       - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"

+  - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
+    depends_on:
+      - build-release-image-x86-ubuntu2404
+      - build-release-image-arm64-ubuntu2404
+    id: create-multi-arch-manifest-ubuntu2404
+    agents:
+      queue: small_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+
+  - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
+    depends_on:
+      - build-release-image-x86-cuda-13-0-ubuntu2404
+      - build-release-image-arm64-cuda-13-0-ubuntu2404
+    id: create-multi-arch-manifest-cuda-13-0-ubuntu2404
+    agents:
+      queue: small_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+
+  - block: "Confirm publishing release images to DockerHub"
+    key: block-publish-release-images-dockerhub
+    depends_on:
+      - create-multi-arch-manifest
+      - create-multi-arch-manifest-cuda-13-0
+      - build-release-image-x86-cpu
+      - build-release-image-arm64-cpu
+      - build-rocm-release-image
+
+  - label: "Publish release images to DockerHub"
+    key: publish-release-images-dockerhub
+    depends_on:
+      - block-publish-release-images-dockerhub
+    agents:
+      queue: small_cpu_queue_release
+    commands:
+      - "bash .buildkite/scripts/push-release-builds.sh"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
+
   - label: "Publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
     if: build.env("NIGHTLY") == "1"
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/push-nightly-builds.sh"
       # Clean up old nightly builds (keep only last 14)
@@ -235,7 +330,7 @@ steps:
       - create-multi-arch-manifest-cuda-13-0
     if: build.env("NIGHTLY") == "1"
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
       # Clean up old nightly builds (keep only last 14)
@@ -262,7 +357,7 @@ steps:
       - block-upload-release-wheels
     id: upload-release-wheels
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"

@@ -323,7 +418,7 @@ steps:
       - step: input-rocm-config
         allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       # Set configuration and check cache
       - |
@@ -465,7 +560,7 @@ steps:
       - step: build-rocm-base-wheels
         allow_failure: false
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     timeout_in_minutes: 180
     commands:
       # Download artifacts and prepare Docker image
@@ -575,7 +670,7 @@ steps:
       - step: build-rocm-vllm-wheel
         allow_failure: false
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     timeout_in_minutes: 60
     commands:
       # Download all wheel artifacts and run upload
@@ -624,7 +719,7 @@ steps:
      - step: input-release-version
        allow_failure: true
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "bash .buildkite/scripts/annotate-rocm-release.sh"
     env:
@@ -641,7 +736,7 @@ steps:
     depends_on: block-generate-root-index-rocm-wheels
     id: generate-root-index-rocm-wheels
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
     env:
@@ -655,7 +750,7 @@ steps:
      - step: build-rocm-base-wheels
        allow_failure: false
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     timeout_in_minutes: 60
     commands:
       - |
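A quick spot-check of one of the multi-arch manifests these steps produce (a sketch; the commit SHA is a placeholder for the Buildkite commit of the release build):

```bash
COMMIT=0123456789abcdef   # placeholder SHA
docker manifest inspect \
  "public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-cu130-ubuntu2404" \
  | grep '"architecture"'   # expect one amd64 and one arm64 entry
```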
.buildkite/scripts/push-release-builds.sh (new executable file, 113 lines)
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Ensure git tags are up-to-date (Buildkite's default fetch doesn't always include tags)
+echo "Fetching latest tags from origin..."
+git fetch --tags --force origin
+
+# Derive release version from the git tag on the current commit.
+# The pipeline must be triggered on a tagged commit (e.g. v0.18.1).
+RELEASE_VERSION=$(git describe --exact-match --tags "${BUILDKITE_COMMIT}" 2>/dev/null || true)
+if [ -z "${RELEASE_VERSION}" ]; then
+    echo "[FATAL] Commit ${BUILDKITE_COMMIT} has no exact git tag. " \
+        "Release images must be published from a tagged commit."
+    exit 1
+fi
+
+# Strip leading 'v' for use in Docker tags (e.g. v0.18.1 -> 0.18.1)
+PURE_VERSION="${RELEASE_VERSION#v}"
+
+echo "========================================"
+echo "Publishing release images"
+echo "  Commit: ${BUILDKITE_COMMIT}"
+echo "  Release version: ${RELEASE_VERSION}"
+echo "========================================"
+
+set -x
+
+# ---- CUDA (default, CUDA 12.9) ----
+docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64"
+docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" "vllm/vllm-openai:latest-x86_64"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" "vllm/vllm-openai:v${PURE_VERSION}-x86_64"
+docker push "vllm/vllm-openai:latest-x86_64"
+docker push "vllm/vllm-openai:v${PURE_VERSION}-x86_64"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" "vllm/vllm-openai:latest-aarch64"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
+docker push "vllm/vllm-openai:latest-aarch64"
+docker push "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
+
+docker manifest rm "vllm/vllm-openai:latest" || true
+docker manifest create "vllm/vllm-openai:latest" "vllm/vllm-openai:latest-x86_64" "vllm/vllm-openai:latest-aarch64"
+docker manifest push "vllm/vllm-openai:latest"
+
+docker manifest rm "vllm/vllm-openai:v${PURE_VERSION}" || true
+docker manifest create "vllm/vllm-openai:v${PURE_VERSION}" "vllm/vllm-openai:v${PURE_VERSION}-x86_64" "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
+docker manifest push "vllm/vllm-openai:v${PURE_VERSION}"
+
+# ---- CUDA 13.0 ----
+docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130"
+docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" "vllm/vllm-openai:latest-x86_64-cu130"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130"
+docker push "vllm/vllm-openai:latest-x86_64-cu130"
+docker push "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" "vllm/vllm-openai:latest-aarch64-cu130"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
+docker push "vllm/vllm-openai:latest-aarch64-cu130"
+docker push "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
+
+docker manifest rm "vllm/vllm-openai:latest-cu130" || true
+docker manifest create "vllm/vllm-openai:latest-cu130" "vllm/vllm-openai:latest-x86_64-cu130" "vllm/vllm-openai:latest-aarch64-cu130"
+docker manifest push "vllm/vllm-openai:latest-cu130"
+
+docker manifest rm "vllm/vllm-openai:v${PURE_VERSION}-cu130" || true
+docker manifest create "vllm/vllm-openai:v${PURE_VERSION}-cu130" "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
+docker manifest push "vllm/vllm-openai:v${PURE_VERSION}-cu130"
+
+# ---- ROCm ----
+docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm"
+docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:latest"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:v${PURE_VERSION}"
+docker push "vllm/vllm-openai-rocm:latest"
+docker push "vllm/vllm-openai-rocm:v${PURE_VERSION}"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:latest-base"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:v${PURE_VERSION}-base"
+docker push "vllm/vllm-openai-rocm:latest-base"
+docker push "vllm/vllm-openai-rocm:v${PURE_VERSION}-base"
+
+# ---- CPU ----
+# CPU images in ECR are tagged with the full version including 'v' (e.g. v0.18.1),
+# matching the value from the Buildkite release-version metadata input.
+docker pull "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}"
+docker pull "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:latest-x86_64"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64"
+docker push "vllm/vllm-openai-cpu:latest-x86_64"
+docker push "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64"
+
+docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:latest-arm64"
+docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
+docker push "vllm/vllm-openai-cpu:latest-arm64"
+docker push "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
+
+docker manifest rm "vllm/vllm-openai-cpu:latest" || true
+docker manifest create "vllm/vllm-openai-cpu:latest" "vllm/vllm-openai-cpu:latest-x86_64" "vllm/vllm-openai-cpu:latest-arm64"
+docker manifest push "vllm/vllm-openai-cpu:latest"
+
+docker manifest rm "vllm/vllm-openai-cpu:v${PURE_VERSION}" || true
+docker manifest create "vllm/vllm-openai-cpu:v${PURE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64" "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
+docker manifest push "vllm/vllm-openai-cpu:v${PURE_VERSION}"
+
+echo "========================================"
+echo "Successfully published release images for ${RELEASE_VERSION}"
+echo "========================================"
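The version-derivation gate at the top of this script can be reproduced locally; `git describe --exact-match --tags` succeeds only when the commit carries a tag, which is exactly the invariant the script enforces (a minimal sketch):

```bash
# On a tagged commit this prints e.g. "v0.18.1"; otherwise the fallback runs.
git fetch --tags --force origin
git describe --exact-match --tags HEAD 2>/dev/null || echo "not a tagged commit"
```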
@@ -45,6 +45,22 @@ steps:
     commands:
       - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

+  - label: LM Eval Qwen3.5 Models (B200)
+    timeout_in_minutes: 120
+    device: b200
+    optional: true
+    num_devices: 2
+    source_file_dependencies:
+      - vllm/model_executor/models/qwen3_5.py
+      - vllm/model_executor/models/qwen3_5_mtp.py
+      - vllm/transformers_utils/configs/qwen3_5.py
+      - vllm/transformers_utils/configs/qwen3_5_moe.py
+      - vllm/model_executor/models/qwen3_next.py
+      - vllm/model_executor/models/qwen3_next_mtp.py
+      - vllm/model_executor/layers/fla/ops/
+    commands:
+      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
+
   - label: LM Eval Large Models (H200)
     timeout_in_minutes: 60
     device: h200
@@ -24,6 +24,7 @@

 ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12
+ARG UBUNTU_VERSION=22.04

 # By parameterizing the base images, we allow third-party to use their own
 # base images. One use case is hermetic builds with base images stored in
@@ -38,7 +39,7 @@ ARG PYTHON_VERSION=3.12
 # version are not backwards compatible with OSes that use an earlier version.
 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 # Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
-ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
+ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}

 # By parameterizing the Deadsnakes repository URL, we allow third-party to use
 # their own mirror. When doing so, we don't benefit from the transparent
@@ -111,6 +112,10 @@ RUN apt-get update -y \
         gcc-10 \
         g++-10 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
+    # Install python dev headers if available (needed for cmake FindPython on Ubuntu 24.04
+    # which ships cmake 3.28 and requires Development.SABIModule; silently skipped on
+    # Ubuntu 20.04/22.04 where python3.x-dev is not available without a PPA)
+    && (apt-get install -y --no-install-recommends python${PYTHON_VERSION}-dev 2>/dev/null || true) \
     && rm -rf /var/lib/apt/lists/* \
     && curl -LsSf https://astral.sh/uv/install.sh | sh \
     && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
@@ -507,7 +512,6 @@ RUN apt-get update -y \
         software-properties-common \
         curl \
         sudo \
-        python3-pip \
         ffmpeg \
         libsm6 \
         libxext6 \
@@ -535,6 +539,7 @@ RUN apt-get update -y \
     && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
     && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && rm -f /usr/lib/python${PYTHON_VERSION}/EXTERNALLY-MANAGED \
     && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
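With `UBUNTU_VERSION` now an ARG, a 24.04-based image can be built directly from `docker/Dockerfile`; the sketch below mirrors the build args used by the new release-pipeline steps (the local tag name is illustrative):

```bash
DOCKER_BUILDKIT=1 docker build \
  --build-arg CUDA_VERSION=12.9.1 \
  --build-arg UBUNTU_VERSION=24.04 \
  --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \
  --build-arg FLASHINFER_AOT_COMPILE=true \
  --tag vllm:openai-ubuntu24.04 \
  --target vllm-openai --progress plain -f docker/Dockerfile .
```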
@@ -593,6 +598,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config

+# Pre-download FlashInfer TRTLLM BMM headers for air-gapped environments.
+# At runtime, MoE JIT compilation downloads these from edge.urm.nvidia.com
+# which fails without internet. This step caches them at build time.
+RUN python3 <<'PYEOF'
+from flashinfer.jit import env as jit_env
+from flashinfer.jit.cubin_loader import download_trtllm_headers, get_cubin
+from flashinfer.artifacts import ArtifactPath, CheckSumHash
+
+download_trtllm_headers(
+    'bmm',
+    jit_env.FLASHINFER_CUBIN_DIR / 'flashinfer' / 'trtllm' / 'batched_gemm' / 'trtllmGen_bmm_export',
+    f'{ArtifactPath.TRTLLM_GEN_BMM}/include/trtllmGen_bmm_export',
+    ArtifactPath.TRTLLM_GEN_BMM,
+    get_cubin(f'{ArtifactPath.TRTLLM_GEN_BMM}/checksums.txt', CheckSumHash.TRTLLM_GEN_BMM),
+)
+
+print('FlashInfer TRTLLM BMM headers downloaded successfully')
+PYEOF
+
 # ============================================================
 # OPENAI API SERVER DEPENDENCIES
 # Pre-install these to avoid reinstalling on every vLLM wheel rebuild
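A rough smoke test for the pre-downloaded headers is to run the built image with networking disabled and list the cache directory (a sketch; the image tag is illustrative and the `--entrypoint` override assumes the image's default entrypoint is the API server):

```bash
docker run --rm --network none --entrypoint python3 vllm:openai-ubuntu24.04 -c "
from flashinfer.jit import env as jit_env
import os
d = jit_env.FLASHINFER_CUBIN_DIR / 'flashinfer' / 'trtllm' / 'batched_gemm'
print(os.listdir(d))  # non-empty if the headers were cached at build time
"
```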
@@ -33,6 +33,10 @@ group "default" {
   targets = ["openai"]
 }

+group "all" {
+  targets = ["openai", "openai-ubuntu2404"]
+}
+
 # Base targets

 target "_common" {
@@ -74,3 +78,29 @@ target "openai" {
   tags = ["vllm:openai"]
   output = ["type=docker"]
 }
+
+# Ubuntu 24.04 targets
+
+target "test-ubuntu2404" {
+  inherits = ["_common", "_labels"]
+  target = "test"
+  tags = ["vllm:test-ubuntu24.04"]
+  args = {
+    UBUNTU_VERSION = "24.04"
+    GDRCOPY_OS_VERSION = "Ubuntu24_04"
+    FLASHINFER_AOT_COMPILE = "true"
+  }
+  output = ["type=docker"]
+}
+
+target "openai-ubuntu2404" {
+  inherits = ["_common", "_labels"]
+  target = "vllm-openai"
+  tags = ["vllm:openai-ubuntu24.04"]
+  args = {
+    UBUNTU_VERSION = "24.04"
+    GDRCOPY_OS_VERSION = "Ubuntu24_04"
+    FLASHINFER_AOT_COMPILE = "true"
+  }
+  output = ["type=docker"]
+}
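These targets can be driven locally with buildx bake, which by default reads `docker-bake.hcl` from the working directory (a sketch):

```bash
docker buildx bake openai-ubuntu2404   # just the Ubuntu 24.04 runtime image
docker buildx bake all                 # both "openai" and "openai-ubuntu2404"
```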
@@ -7,6 +7,9 @@
     "PYTHON_VERSION": {
       "default": "3.12"
     },
+    "UBUNTU_VERSION": {
+      "default": "22.04"
+    },
     "BUILD_BASE_IMAGE": {
       "default": "nvidia/cuda:12.9.1-devel-ubuntu20.04"
     },
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
+model_name: "Qwen/Qwen3.5-35B-A3B"
+accuracy_threshold: 0.84
+tolerance: 0.03
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --data-parallel-size 2
+  --enable-expert-parallel

tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
+model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
+accuracy_threshold: 0.79
+tolerance: 0.03
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --data-parallel-size 2
+  --enable-expert-parallel
+  --kv-cache-dtype fp8

tests/evals/gsm8k/configs/Qwen3.5-397B-A17B-NVFP4-DEP2.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
+model_name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
+accuracy_threshold: 0.88
+tolerance: 0.03
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --data-parallel-size 2
+  --enable-expert-parallel

tests/evals/gsm8k/configs/models-qwen35-blackwell.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
+Qwen3.5-35B-A3B-DEP2.yaml
+Qwen3.5-35B-A3B-FP8-DEP2.yaml
+Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
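The `server_args` in these configs correspond to a server launch along these lines (a sketch; in CI the eval harness starts the server itself):

```bash
vllm serve Qwen/Qwen3.5-35B-A3B \
  --max-model-len 4096 \
  --data-parallel-size 2 \
  --enable-expert-parallel
```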
@@ -19,8 +19,6 @@ from vllm.platforms import current_platform

 from .gsm8k_eval import evaluate_gsm8k

-TOL = 0.08  # Absolute tolerance for accuracy comparison
-

 def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
     """Run GSM8K evaluation using our isolated script."""
@@ -99,20 +97,20 @@ def test_gsm8k_correctness(config_filename):

     measured_metric = results["accuracy"]
     expected_metric = eval_config["accuracy_threshold"]
+    tol = eval_config.get("tolerance", 0.08)

     print(f"GSM8K Results for {eval_config['model_name']}:")
     print(f"  Measured metric: {measured_metric:.4f}")
     print(f"  Expected metric: {expected_metric:.4f}")
-    print(f"  Tolerance: {TOL:.4f}")
+    print(f"  Tolerance: {tol:.4f}")
     print(f"  Questions: {results['num_questions']}")
     print(f"  Invalid rate: {results['invalid_rate']:.3f}")
     print(f"  Latency: {results['latency']:.1f}s")
     print(f"  QPS: {results['questions_per_second']:.1f}")

     # Verify metric is within tolerance
-    assert measured_metric >= expected_metric - TOL, (
+    assert measured_metric >= expected_metric - tol, (
         f"GSM8K metric too low: {measured_metric:.4f} < "
-        f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
+        f"{expected_metric:.4f} - {tol:.4f} = {expected_metric - tol:.4f}"
     )

     print(f"✅ GSM8K test passed for {eval_config['model_name']}")
@@ -24,6 +24,7 @@ from transformers import (
     GenerationConfig,
     GenerationMixin,
 )
+from transformers.masking_utils import create_causal_mask
 from transformers.video_utils import VideoMetadata

 from vllm.logprobs import SampleLogprobs
@@ -679,10 +680,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         sin = sin.to(inputs_embeds.dtype)

         # Prepare attention mask
-        if attention_mask is not None:
-            attention_mask = self._update_causal_mask(
-                attention_mask, inputs_embeds, cache_position, past_key_values, False
-            )
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+            cache_position=cache_position,
+        )

         # Initialize and collect hidden states
         hidden_states = inputs_embeds
@@ -780,6 +780,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
         "baidu/ERNIE-4.5-VL-28B-A3B-PT",
         trust_remote_code=True,
+        revision="refs/pr/17",
     ),
     "FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
         "allendou/FireRedASR2-LLM-vllm",
@@ -373,8 +373,15 @@ class InductorStandaloneAdaptor(CompilerInterface):
                 break

         if input_fake_mode is not None:
-            fake_mode_ctx: Any = patch(
-                "torch._inductor.standalone_compile.FakeTensorMode",
+            # Use patch.object on the actual module from sys.modules
+            # because in Python <=3.10 the string-based patch() resolves
+            # torch._inductor.standalone_compile to the wrapper function
+            # (defined in __init__.py) instead of the module.
+            import sys
+
+            fake_mode_ctx: Any = patch.object(
+                sys.modules["torch._inductor.standalone_compile"],
+                "FakeTensorMode",
                 lambda *a, **kw: input_fake_mode,
             )
         else:
@@ -30,7 +30,7 @@ class AttentionConfig:
     use_cudnn_prefill: bool = False
     """Whether to use cudnn prefill."""

-    use_trtllm_ragged_deepseek_prefill: bool = False
+    use_trtllm_ragged_deepseek_prefill: bool = True
    """Whether to use TRTLLM ragged deepseek prefill."""

     use_trtllm_attention: bool | None = None
@@ -682,6 +682,27 @@ class VllmConfig:
             self.model_config, self.load_config
         )

+        if (
+            self.quant_config is not None
+            and self.model_config is not None
+            and hasattr(self.quant_config, "use_deep_gemm")
+            and self.quant_config.use_deep_gemm is None
+        ):
+            from vllm.utils.deep_gemm import should_auto_disable_deep_gemm
+
+            model_type = getattr(self.model_config.hf_text_config, "model_type", None)
+            if should_auto_disable_deep_gemm(model_type):
+                self.quant_config.use_deep_gemm = False
+                logger.warning_once(
+                    "Auto-disabled DeepGemm for model_type=%s on Blackwell. "
+                    "DeepGemm E8M0 scale format causes accuracy degradation "
+                    "for this architecture. Falling back to CUTLASS. "
+                    "To disable DeepGemm globally, set VLLM_USE_DEEP_GEMM=0.",
+                    model_type,
+                )
+
         from vllm.v1.executor.abstract import Executor

         executor_backend = self.parallel_config.distributed_executor_backend
         executor_supports_async_sched = executor_backend in (
             "mp",
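As the warning text notes, the same fallback can also be forced globally regardless of model type (a sketch using one of the affected models):

```bash
VLLM_USE_DEEP_GEMM=0 vllm serve Qwen/Qwen3.5-35B-A3B-FP8 --max-model-len 4096
```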
@@ -253,23 +253,25 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        """Monolithic kernels need to express router support."""
+        """Monolithic kernels need to express router support.
+        Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
+        internal routing for these methods produces output uncorrelated
+        with the modular kernel's output and with Triton kernel's output
+        for Qwen3.5-35B-A3B-FP8.
+        See: https://github.com/vllm-project/vllm/issues/37591
+        """
         # NOTE(dbari): TopK routing could also be enabled, but need to validate models
         # NOTE(dbari): Default is not implemented and should not be enabled until it is
         if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym):
             # NOTE(rob): potentially allow others here. This is a conservative list.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
             # NOTE(dbari): as above, potentially allow others here.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Llama4,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         else:
             raise ValueError("Unsupported quantization scheme.")
@@ -135,6 +135,7 @@ class Fp8Config(QuantizationConfig):
                 f"{activation_scheme} activation scheme."
             )
         self.weight_block_size = weight_block_size
+        self.use_deep_gemm: bool | None = None

     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -291,7 +292,10 @@ class Fp8LinearMethod(LinearMethodBase):
             self.use_marlin = False

         self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
-        self.use_deep_gemm = is_deep_gemm_supported()
+        if self.quant_config.use_deep_gemm is not None:
+            self.use_deep_gemm = self.quant_config.use_deep_gemm
+        else:
+            self.use_deep_gemm = is_deep_gemm_supported()

         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant = self.weight_block_size is not None
@@ -305,6 +309,7 @@ class Fp8LinearMethod(LinearMethodBase):
                 act_quant_group_shape=GroupShape(1, self.weight_block_size[0]),
                 cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                 use_aiter_and_is_supported=self.use_aiter_and_is_supported,
+                use_deep_gemm=self.use_deep_gemm,
             )
         else:
             # Use per-token quantization for better perf if dynamic and cutlass
@@ -440,7 +445,7 @@ class Fp8LinearMethod(LinearMethodBase):
             del layer.input_scale
             return

-        if self.block_quant:
+        if self.block_quant and self.use_deep_gemm:
             maybe_post_process_fp8_weight_block(layer)

     def apply(
@@ -91,6 +91,7 @@ class QuantFP8(CustomOp):

         if (
             self.is_group_quant
             and self.use_ue8m0
+            and self.use_deep_gemm_supported
             and (DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0)
         ):
@@ -356,10 +356,14 @@ class W8A8BlockFp8LinearOp:
         act_quant_group_shape: GroupShape,
         cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
         use_aiter_and_is_supported: bool = False,
+        use_deep_gemm: bool | None = None,
     ):
         self.weight_group_shape = weight_group_shape
         self.act_quant_group_shape = act_quant_group_shape
-        self.is_deep_gemm_supported = is_deep_gemm_supported()
+        if use_deep_gemm is not None:
+            self.is_deep_gemm_supported = use_deep_gemm
+        else:
+            self.is_deep_gemm_supported = is_deep_gemm_supported()
         self.is_hopper = current_platform.is_device_capability(90)
         self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
         self.is_flashinfer_supported = is_flashinfer_fp8_blockscale_gemm_supported()
@@ -23,6 +23,24 @@ from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_gemm
 from vllm.utils.math_utils import cdiv

+_DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES: set[str] = {
+    "qwen3_5_text",
+    "qwen3_5_moe_text",
+}
+
+
+def should_auto_disable_deep_gemm(model_type: str | None) -> bool:
+    """Check if DeepGemm should be auto-disabled for this model on Blackwell.
+
+    Returns True if the model is known to have accuracy degradation with
+    DeepGemm's E8M0 scale format on Blackwell GPUs (SM100+).
+    """
+    if model_type is None:
+        return False
+    if not current_platform.is_device_capability_family(100):
+        return False
+    return model_type in _DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES
+

 class DeepGemmQuantScaleFMT(Enum):
     # Float32 scales in Float32 tensor