Compare commits


14 Commits

Author SHA1 Message Date
Matthew Bonanni
a26e8dc7ff [Bugfix][MLA] Change default SM100 MLA prefill backend back to TRT-LLM (#38562)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit 2c734ed0e0)
2026-03-30 12:42:26 -07:00
khluu
599e7359a3 release push
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-30 12:41:59 -07:00
Kevin H. Luu
d0cf73ce42 [release] Move the rest of release jobs to release queue (#38044)
Signed-off-by: khluu <khluu000@gmail.com>
(cherry picked from commit af945615b5)
2026-03-30 11:07:11 -07:00
amey asgaonkar
f0a5c5973b Add Ubuntu 24.04 support for Docker builds (#35386)
Signed-off-by: aasgaonkar <aasgaonkar@nvidia.com>
(cherry picked from commit 0c1809c806)
2026-03-30 11:06:43 -07:00
Kevin H. Luu
b7e4b88987 [release] Move agent queue to Release cluster queues (#37783)
Signed-off-by: khluu <khluu000@gmail.com>
(cherry picked from commit 7281199a8c)
2026-03-30 10:58:59 -07:00
haosdent
90b29e5302 [CI] Fix Ernie4.5-VL initialization test (#38429)
Signed-off-by: haosdent <haosdent@gmail.com>
(cherry picked from commit b2bc736b12)
2026-03-30 01:02:23 -07:00
Nicolò Lucchesi
a45d96ff42 [CI] Skip failing test (#38369)
Signed-off-by: NickLucche <nlucches@redhat.com>
(cherry picked from commit 44a6528028)
2026-03-29 00:08:38 -07:00
Harry Mellor
7693c8eabf Fix attribute error in isaac_patch_hf_runner (#37685)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
(cherry picked from commit 9f6d9dd371)
2026-03-29 00:07:54 -07:00
Vadim Gimpelson
7624525bf6 cherry-pick [Bugfix] Restore prepare_fp8_layer_for_marlin removed by merge conflict resolution (#38398)
Signed-off-by: khluu <khluu000@gmail.com>
Co-authored-by: vadiklyutiy <vgimpelson@nvidia.com>
2026-03-27 14:49:47 -07:00
Michael Goin
d1b4f10b19 cherry-pick [CI Bugfix] Pre-download missing FlashInfer headers in Docker build (#38391)
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-27 14:49:47 -07:00
khluu
9fdc0f3aeb merge
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-26 02:17:52 -07:00
Vadim Gimpelson
05d96d7991 merge
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-26 01:25:41 -07:00
Dimitrios Bariamis
ccbc5ac449 [Bugfix] Fix mock.patch resolution failure for standalone_compile.FakeTensorMode on Python <= 3.10 (#37158)
Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
(cherry picked from commit 1204cf0a9d)
2026-03-24 17:59:17 -07:00
khluu
bcf2be9612 [cherry-pick][Bugfix] Disable monolithic TRTLLM MoE for Renormalize routing (#37591) (#37605)
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-19 15:06:38 -07:00
21 changed files with 422 additions and 48 deletions

View File

@@ -12,7 +12,7 @@ steps:
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
@@ -27,7 +27,7 @@ steps:
     depends_on: ~
     id: build-wheel-arm64-cuda-13-0
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
@@ -42,7 +42,7 @@ steps:
     depends_on: ~
     id: build-wheel-arm64-cpu
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
@@ -55,7 +55,7 @@ steps:
     depends_on: ~
     id: build-wheel-x86-cuda-12-9
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
@@ -68,7 +68,7 @@ steps:
     depends_on: ~
     id: build-wheel-x86-cuda-13-0
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
@@ -81,7 +81,7 @@ steps:
     depends_on: ~
     id: build-wheel-x86-cpu
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
@@ -97,7 +97,7 @@ steps:
     depends_on: ~
     id: build-release-image-x86
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
@@ -110,7 +110,7 @@ steps:
     depends_on: ~
     id: build-release-image-arm64
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
@@ -120,7 +120,7 @@ steps:
     depends_on: ~
     id: build-release-image-x86-cuda-13-0
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
@@ -133,13 +133,57 @@ steps:
     depends_on: ~
     id: build-release-image-arm64-cuda-13-0
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"

+  - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-x86-ubuntu2404
+    agents:
+      queue: cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+
+  - label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-arm64-ubuntu2404
+    agents:
+      queue: arm64_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
+
+  - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-x86-cuda-13-0-ubuntu2404
+    agents:
+      queue: cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+
+  - label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
+    depends_on: ~
+    id: build-release-image-arm64-cuda-13-0-ubuntu2404
+    agents:
+      queue: arm64_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
+
   - block: "Build release image for x86_64 CPU"
     key: block-cpu-release-image-build
     depends_on: ~
@@ -148,8 +192,9 @@ steps:
     depends_on:
       - block-cpu-release-image-build
      - input-release-version
+    id: build-release-image-x86-cpu
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
@@ -166,8 +211,9 @@ steps:
     depends_on:
       - block-arm64-cpu-release-image-build
       - input-release-version
+    id: build-release-image-arm64-cpu
     agents:
-      queue: arm64_cpu_queue_postmerge
+      queue: arm64_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
@@ -185,7 +231,7 @@ steps:
       - build-release-image-arm64
     id: create-multi-arch-manifest
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
@@ -196,7 +242,7 @@ steps:
       - create-multi-arch-manifest
     id: annotate-release-workflow
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"
@@ -206,18 +252,67 @@ steps:
       - build-release-image-arm64-cuda-13-0
     id: create-multi-arch-manifest-cuda-13-0
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
       - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"

+  - label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
+    depends_on:
+      - build-release-image-x86-ubuntu2404
+      - build-release-image-arm64-ubuntu2404
+    id: create-multi-arch-manifest-ubuntu2404
+    agents:
+      queue: small_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
+
+  - label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
+    depends_on:
+      - build-release-image-x86-cuda-13-0-ubuntu2404
+      - build-release-image-arm64-cuda-13-0-ubuntu2404
+    id: create-multi-arch-manifest-cuda-13-0-ubuntu2404
+    agents:
+      queue: small_cpu_queue_release
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
+
+  - block: "Confirm publishing release images to DockerHub"
+    key: block-publish-release-images-dockerhub
+    depends_on:
+      - create-multi-arch-manifest
+      - create-multi-arch-manifest-cuda-13-0
+      - build-release-image-x86-cpu
+      - build-release-image-arm64-cpu
+      - build-rocm-release-image
+
+  - label: "Publish release images to DockerHub"
+    key: publish-release-images-dockerhub
+    depends_on:
+      - block-publish-release-images-dockerhub
+    agents:
+      queue: small_cpu_queue_release
+    commands:
+      - "bash .buildkite/scripts/push-release-builds.sh"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
+
   - label: "Publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
     if: build.env("NIGHTLY") == "1"
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/push-nightly-builds.sh"
       # Clean up old nightly builds (keep only last 14)
@@ -235,7 +330,7 @@ steps:
       - create-multi-arch-manifest-cuda-13-0
     if: build.env("NIGHTLY") == "1"
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
       # Clean up old nightly builds (keep only last 14)
@@ -262,7 +357,7 @@ steps:
       - block-upload-release-wheels
     id: upload-release-wheels
     agents:
-      queue: small_cpu_queue_postmerge
+      queue: small_cpu_queue_release
     commands:
       - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
@@ -323,7 +418,7 @@ steps:
       - step: input-rocm-config
         allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       # Set configuration and check cache
       - |
@@ -465,7 +560,7 @@ steps:
       - step: build-rocm-base-wheels
         allow_failure: false
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     timeout_in_minutes: 180
     commands:
       # Download artifacts and prepare Docker image
@@ -575,7 +670,7 @@ steps:
       - step: build-rocm-vllm-wheel
         allow_failure: false
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     timeout_in_minutes: 60
     commands:
       # Download all wheel artifacts and run upload
@@ -624,7 +719,7 @@ steps:
       - step: input-release-version
         allow_failure: true
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "bash .buildkite/scripts/annotate-rocm-release.sh"
     env:
@@ -641,7 +736,7 @@ steps:
     depends_on: block-generate-root-index-rocm-wheels
     id: generate-root-index-rocm-wheels
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     commands:
       - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
     env:
@@ -655,7 +750,7 @@ steps:
       - step: build-rocm-base-wheels
         allow_failure: false
     agents:
-      queue: cpu_queue_postmerge
+      queue: cpu_queue_release
     timeout_in_minutes: 60
     commands:
       - |
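Note on the tag scheme: each build variant pushes per-architecture tags, and each manifest step stitches them into a multi-arch tag. A minimal Python sketch of how these names compose (illustrative only; this helper is not part of the pipeline):

# Illustrative helper (not part of the pipeline): compose the ECR tag
# names used by the build and manifest steps above from the variant
# suffixes (-cu130, -ubuntu2404) and the architecture.
REPO = "public.ecr.aws/q9t5s3a7/vllm-release-repo"

def release_tags(commit: str, arch: str, cuda13: bool = False, ubuntu2404: bool = False) -> tuple[str, str]:
    """Return (per-arch tag, multi-arch manifest tag) for one build variant."""
    suffix = ("-cu130" if cuda13 else "") + ("-ubuntu2404" if ubuntu2404 else "")
    return f"{REPO}:{commit}-{arch}{suffix}", f"{REPO}:{commit}{suffix}"

per_arch, manifest = release_tags("a26e8dc7ff", "x86_64", cuda13=True, ubuntu2404=True)
print(per_arch)   # ...vllm-release-repo:a26e8dc7ff-x86_64-cu130-ubuntu2404
print(manifest)   # ...vllm-release-repo:a26e8dc7ff-cu130-ubuntu2404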

View File

@@ -0,0 +1,113 @@
#!/bin/bash
set -euo pipefail

# Ensure git tags are up-to-date (Buildkite's default fetch doesn't always include tags)
echo "Fetching latest tags from origin..."
git fetch --tags --force origin

# Derive release version from the git tag on the current commit.
# The pipeline must be triggered on a tagged commit (e.g. v0.18.1).
RELEASE_VERSION=$(git describe --exact-match --tags "${BUILDKITE_COMMIT}" 2>/dev/null || true)
if [ -z "${RELEASE_VERSION}" ]; then
  echo "[FATAL] Commit ${BUILDKITE_COMMIT} has no exact git tag. " \
    "Release images must be published from a tagged commit."
  exit 1
fi

# Strip leading 'v' for use in Docker tags (e.g. v0.18.1 -> 0.18.1)
PURE_VERSION="${RELEASE_VERSION#v}"

echo "========================================"
echo "Publishing release images"
echo "  Commit: ${BUILDKITE_COMMIT}"
echo "  Release version: ${RELEASE_VERSION}"
echo "========================================"

set -x

# ---- CUDA (default, CUDA 12.9) ----
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" "vllm/vllm-openai:latest-x86_64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" "vllm/vllm-openai:v${PURE_VERSION}-x86_64"
docker push "vllm/vllm-openai:latest-x86_64"
docker push "vllm/vllm-openai:v${PURE_VERSION}-x86_64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" "vllm/vllm-openai:latest-aarch64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
docker push "vllm/vllm-openai:latest-aarch64"
docker push "vllm/vllm-openai:v${PURE_VERSION}-aarch64"

docker manifest rm "vllm/vllm-openai:latest" || true
docker manifest create "vllm/vllm-openai:latest" "vllm/vllm-openai:latest-x86_64" "vllm/vllm-openai:latest-aarch64"
docker manifest push "vllm/vllm-openai:latest"

docker manifest rm "vllm/vllm-openai:v${PURE_VERSION}" || true
docker manifest create "vllm/vllm-openai:v${PURE_VERSION}" "vllm/vllm-openai:v${PURE_VERSION}-x86_64" "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
docker manifest push "vllm/vllm-openai:v${PURE_VERSION}"

# ---- CUDA 13.0 ----
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" "vllm/vllm-openai:latest-x86_64-cu130"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130"
docker push "vllm/vllm-openai:latest-x86_64-cu130"
docker push "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" "vllm/vllm-openai:latest-aarch64-cu130"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
docker push "vllm/vllm-openai:latest-aarch64-cu130"
docker push "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"

docker manifest rm "vllm/vllm-openai:latest-cu130" || true
docker manifest create "vllm/vllm-openai:latest-cu130" "vllm/vllm-openai:latest-x86_64-cu130" "vllm/vllm-openai:latest-aarch64-cu130"
docker manifest push "vllm/vllm-openai:latest-cu130"

docker manifest rm "vllm/vllm-openai:v${PURE_VERSION}-cu130" || true
docker manifest create "vllm/vllm-openai:v${PURE_VERSION}-cu130" "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
docker manifest push "vllm/vllm-openai:v${PURE_VERSION}-cu130"

# ---- ROCm ----
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:latest"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:v${PURE_VERSION}"
docker push "vllm/vllm-openai-rocm:latest"
docker push "vllm/vllm-openai-rocm:v${PURE_VERSION}"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:latest-base"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:v${PURE_VERSION}-base"
docker push "vllm/vllm-openai-rocm:latest-base"
docker push "vllm/vllm-openai-rocm:v${PURE_VERSION}-base"

# ---- CPU ----
# CPU images in ECR are tagged with the full version including 'v' (e.g. v0.18.1),
# matching the value from the Buildkite release-version metadata input.
docker pull "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}"
docker pull "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}"

docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:latest-x86_64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64"
docker push "vllm/vllm-openai-cpu:latest-x86_64"
docker push "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:latest-arm64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
docker push "vllm/vllm-openai-cpu:latest-arm64"
docker push "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"

docker manifest rm "vllm/vllm-openai-cpu:latest" || true
docker manifest create "vllm/vllm-openai-cpu:latest" "vllm/vllm-openai-cpu:latest-x86_64" "vllm/vllm-openai-cpu:latest-arm64"
docker manifest push "vllm/vllm-openai-cpu:latest"

docker manifest rm "vllm/vllm-openai-cpu:v${PURE_VERSION}" || true
docker manifest create "vllm/vllm-openai-cpu:v${PURE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64" "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
docker manifest push "vllm/vllm-openai-cpu:v${PURE_VERSION}"

echo "========================================"
echo "Successfully published release images for ${RELEASE_VERSION}"
echo "========================================"

View File

@@ -45,6 +45,22 @@ steps:
     commands:
       - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt

+  - label: LM Eval Qwen3.5 Models (B200)
+    timeout_in_minutes: 120
+    device: b200
+    optional: true
+    num_devices: 2
+    source_file_dependencies:
+      - vllm/model_executor/models/qwen3_5.py
+      - vllm/model_executor/models/qwen3_5_mtp.py
+      - vllm/transformers_utils/configs/qwen3_5.py
+      - vllm/transformers_utils/configs/qwen3_5_moe.py
+      - vllm/model_executor/models/qwen3_next.py
+      - vllm/model_executor/models/qwen3_next_mtp.py
+      - vllm/model_executor/layers/fla/ops/
+    commands:
+      - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
+
   - label: LM Eval Large Models (H200)
     timeout_in_minutes: 60
     device: h200

View File

@@ -24,6 +24,7 @@

 ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12
+ARG UBUNTU_VERSION=22.04

 # By parameterizing the base images, we allow third-party to use their own
 # base images. One use case is hermetic builds with base images stored in
@@ -38,7 +39,7 @@ ARG PYTHON_VERSION=3.12
 # version are not backwards compatible with OSes that use an earlier version.
 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 # Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
-ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
+ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}

 # By parameterizing the Deadsnakes repository URL, we allow third-party to use
 # their own mirror. When doing so, we don't benefit from the transparent
@@ -111,6 +112,10 @@ RUN apt-get update -y \
         gcc-10 \
         g++-10 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
+    # Install python dev headers if available (needed for cmake FindPython on Ubuntu 24.04
+    # which ships cmake 3.28 and requires Development.SABIModule; silently skipped on
+    # Ubuntu 20.04/22.04 where python3.x-dev is not available without a PPA)
+    && (apt-get install -y --no-install-recommends python${PYTHON_VERSION}-dev 2>/dev/null || true) \
     && rm -rf /var/lib/apt/lists/* \
     && curl -LsSf https://astral.sh/uv/install.sh | sh \
     && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
@@ -507,7 +512,6 @@ RUN apt-get update -y \
         software-properties-common \
         curl \
         sudo \
-        python3-pip \
         ffmpeg \
         libsm6 \
         libxext6 \
@@ -535,6 +539,7 @@ RUN apt-get update -y \
     && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
     && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && rm -f /usr/lib/python${PYTHON_VERSION}/EXTERNALLY-MANAGED \
     && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
@@ -593,6 +598,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config

+# Pre-download FlashInfer TRTLLM BMM headers for air-gapped environments.
+# At runtime, MoE JIT compilation downloads these from edge.urm.nvidia.com
+# which fails without internet. This step caches them at build time.
+RUN python3 <<'PYEOF'
+from flashinfer.jit import env as jit_env
+from flashinfer.jit.cubin_loader import download_trtllm_headers, get_cubin
+from flashinfer.artifacts import ArtifactPath, CheckSumHash
+
+download_trtllm_headers(
+    'bmm',
+    jit_env.FLASHINFER_CUBIN_DIR / 'flashinfer' / 'trtllm' / 'batched_gemm' / 'trtllmGen_bmm_export',
+    f'{ArtifactPath.TRTLLM_GEN_BMM}/include/trtllmGen_bmm_export',
+    ArtifactPath.TRTLLM_GEN_BMM,
+    get_cubin(f'{ArtifactPath.TRTLLM_GEN_BMM}/checksums.txt', CheckSumHash.TRTLLM_GEN_BMM),
+)
+print('FlashInfer TRTLLM BMM headers downloaded successfully')
+PYEOF
+
 # ============================================================
 # OPENAI API SERVER DEPENDENCIES
 # Pre-install these to avoid reinstalling on every vLLM wheel rebuild

View File

@@ -33,6 +33,10 @@ group "default" {
   targets = ["openai"]
 }

+group "all" {
+  targets = ["openai", "openai-ubuntu2404"]
+}
+
 # Base targets

 target "_common" {
@@ -74,3 +78,29 @@ target "openai" {
   tags = ["vllm:openai"]
   output = ["type=docker"]
 }
+
+# Ubuntu 24.04 targets
+
+target "test-ubuntu2404" {
+  inherits = ["_common", "_labels"]
+  target = "test"
+  tags = ["vllm:test-ubuntu24.04"]
+  args = {
+    UBUNTU_VERSION = "24.04"
+    GDRCOPY_OS_VERSION = "Ubuntu24_04"
+    FLASHINFER_AOT_COMPILE = "true"
+  }
+  output = ["type=docker"]
+}
+
+target "openai-ubuntu2404" {
+  inherits = ["_common", "_labels"]
+  target = "vllm-openai"
+  tags = ["vllm:openai-ubuntu24.04"]
+  args = {
+    UBUNTU_VERSION = "24.04"
+    GDRCOPY_OS_VERSION = "Ubuntu24_04"
+    FLASHINFER_AOT_COMPILE = "true"
+  }
+  output = ["type=docker"]
+}
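With these targets defined, the Ubuntu 24.04 variant should be buildable locally via docker buildx bake openai-ubuntu2404, and docker buildx bake all builds both the default openai image and the Ubuntu 24.04 one.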

View File

@@ -7,6 +7,9 @@
   "PYTHON_VERSION": {
     "default": "3.12"
   },
+  "UBUNTU_VERSION": {
+    "default": "22.04"
+  },
   "BUILD_BASE_IMAGE": {
     "default": "nvidia/cuda:12.9.1-devel-ubuntu20.04"
   },

View File

@@ -0,0 +1,9 @@
model_name: "Qwen/Qwen3.5-35B-A3B"
accuracy_threshold: 0.84
tolerance: 0.03
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel

View File

@@ -0,0 +1,10 @@
model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
accuracy_threshold: 0.79
tolerance: 0.03
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel
--kv-cache-dtype fp8

View File

@@ -0,0 +1,9 @@
model_name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
accuracy_threshold: 0.88
tolerance: 0.03
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel

View File

@@ -0,0 +1,3 @@
Qwen3.5-35B-A3B-DEP2.yaml
Qwen3.5-35B-A3B-FP8-DEP2.yaml
Qwen3.5-397B-A17B-NVFP4-DEP2.yaml

View File

@@ -19,8 +19,6 @@ from vllm.platforms import current_platform

 from .gsm8k_eval import evaluate_gsm8k

-TOL = 0.08  # Absolute tolerance for accuracy comparison
-

 def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
     """Run GSM8K evaluation using our isolated script."""
@@ -99,20 +97,20 @@ def test_gsm8k_correctness(config_filename):
     measured_metric = results["accuracy"]
     expected_metric = eval_config["accuracy_threshold"]
+    tol = eval_config.get("tolerance", 0.08)

     print(f"GSM8K Results for {eval_config['model_name']}:")
     print(f"  Measured metric: {measured_metric:.4f}")
     print(f"  Expected metric: {expected_metric:.4f}")
-    print(f"  Tolerance: {TOL:.4f}")
+    print(f"  Tolerance: {tol:.4f}")
     print(f"  Questions: {results['num_questions']}")
     print(f"  Invalid rate: {results['invalid_rate']:.3f}")
     print(f"  Latency: {results['latency']:.1f}s")
     print(f"  QPS: {results['questions_per_second']:.1f}")

-    # Verify metric is within tolerance
-    assert measured_metric >= expected_metric - TOL, (
+    assert measured_metric >= expected_metric - tol, (
         f"GSM8K metric too low: {measured_metric:.4f} < "
-        f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
+        f"{expected_metric:.4f} - {tol:.4f} = {expected_metric - tol:.4f}"
     )

     print(f"✅ GSM8K test passed for {eval_config['model_name']}")

View File

@@ -24,6 +24,7 @@ from transformers import (
     GenerationConfig,
     GenerationMixin,
 )
+from transformers.masking_utils import create_causal_mask
 from transformers.video_utils import VideoMetadata

 from vllm.logprobs import SampleLogprobs
@@ -679,10 +680,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         sin = sin.to(inputs_embeds.dtype)

         # Prepare attention mask
-        if attention_mask is not None:
-            attention_mask = self._update_causal_mask(
-                attention_mask, inputs_embeds, cache_position, past_key_values, False
-            )
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+            cache_position=cache_position,
+        )

         # Initialize and collect hidden states
         hidden_states = inputs_embeds

View File

@@ -780,6 +780,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
         "baidu/ERNIE-4.5-VL-28B-A3B-PT",
         trust_remote_code=True,
+        revision="refs/pr/17",
     ),
     "FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
         "allendou/FireRedASR2-LLM-vllm",

View File

@@ -373,8 +373,15 @@ class InductorStandaloneAdaptor(CompilerInterface):
                 break

         if input_fake_mode is not None:
-            fake_mode_ctx: Any = patch(
-                "torch._inductor.standalone_compile.FakeTensorMode",
+            # Use patch.object on the actual module from sys.modules
+            # because in Python <=3.10 the string-based patch() resolves
+            # torch._inductor.standalone_compile to the wrapper function
+            # (defined in __init__.py) instead of the module.
+            import sys
+
+            fake_mode_ctx: Any = patch.object(
+                sys.modules["torch._inductor.standalone_compile"],
+                "FakeTensorMode",
                 lambda *a, **kw: input_fake_mode,
             )
         else:
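For context, a self-contained sketch of the resolution difference this comment describes (hypothetical pkg/sub names, not torch itself):

import sys
import types
from unittest.mock import patch

# Build a package "pkg" whose __init__ binds an attribute shadowing the
# submodule "pkg.sub", analogous to a wrapper function defined in a
# package __init__.py. All names here are hypothetical.
pkg = types.ModuleType("pkg")
sub = types.ModuleType("pkg.sub")
sub.FakeTensorMode = type("FakeTensorMode", (), {})
pkg.sub = lambda *a, **kw: None  # function shadowing the submodule
sys.modules["pkg"] = pkg
sys.modules["pkg.sub"] = sub

# On Python <= 3.10, mock resolves "pkg.sub" via getattr(pkg, "sub") and
# finds the shadowing function, so the string form raises AttributeError;
# Python 3.11+ resolves the real module and the string form succeeds.
try:
    with patch("pkg.sub.FakeTensorMode", object()):
        pass
    print("string-based patch worked (Python >= 3.11 resolver)")
except AttributeError as exc:
    print(f"string-based patch failed: {exc}")

# patch.object on the module object from sys.modules is version-independent.
with patch.object(sys.modules["pkg.sub"], "FakeTensorMode", object()):
    print("patch.object works:", sub.FakeTensorMode)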

View File

@@ -30,7 +30,7 @@ class AttentionConfig:
     use_cudnn_prefill: bool = False
     """Whether to use cudnn prefill."""

-    use_trtllm_ragged_deepseek_prefill: bool = False
+    use_trtllm_ragged_deepseek_prefill: bool = True
     """Whether to use TRTLLM ragged deepseek prefill."""

     use_trtllm_attention: bool | None = None

View File

@@ -682,6 +682,27 @@ class VllmConfig:
             self.model_config, self.load_config
         )

+        if (
+            self.quant_config is not None
+            and self.model_config is not None
+            and hasattr(self.quant_config, "use_deep_gemm")
+            and self.quant_config.use_deep_gemm is None
+        ):
+            from vllm.utils.deep_gemm import should_auto_disable_deep_gemm
+
+            model_type = getattr(self.model_config.hf_text_config, "model_type", None)
+            if should_auto_disable_deep_gemm(model_type):
+                self.quant_config.use_deep_gemm = False
+                logger.warning_once(
+                    "Auto-disabled DeepGemm for model_type=%s on Blackwell. "
+                    "DeepGemm E8M0 scale format causes accuracy degradation "
+                    "for this architecture. Falling back to CUTLASS. "
+                    "To disable DeepGemm globally, set VLLM_USE_DEEP_GEMM=0.",
+                    model_type,
+                )
+
+        from vllm.v1.executor.abstract import Executor
+
         executor_backend = self.parallel_config.distributed_executor_backend
         executor_supports_async_sched = executor_backend in (
             "mp",

View File

@@ -253,23 +253,25 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        """Monolithic kernels need to express router support."""
+        """Monolithic kernels need to express router support.
+        Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
+        internal routing for these methods produces output uncorrelated
+        with the modular kernel's output and with Triton kernel's output
+        for Qwen3.5-35B-A3B-FP8.
+        See: https://github.com/vllm-project/vllm/issues/37591
+        """
         # NOTE(dbari): TopK routing could also be enabled, but need to validate models
         # NOTE(dbari): Default is not implemented and should not be enabled until it is
         if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym):
             # NOTE(rob): potentially allow others here. This is a conservative list.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
             # NOTE(dbari): as above, potentially allow others here.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Llama4,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         else:
             raise ValueError("Unsupported quantization scheme.")

View File

@@ -135,6 +135,7 @@ class Fp8Config(QuantizationConfig):
                 f"{activation_scheme} activation scheme."
             )
         self.weight_block_size = weight_block_size
+        self.use_deep_gemm: bool | None = None

     @classmethod
     def get_name(cls) -> QuantizationMethods:
@@ -291,7 +292,10 @@ class Fp8LinearMethod(LinearMethodBase):
             self.use_marlin = False

         self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
-        self.use_deep_gemm = is_deep_gemm_supported()
+        if self.quant_config.use_deep_gemm is not None:
+            self.use_deep_gemm = self.quant_config.use_deep_gemm
+        else:
+            self.use_deep_gemm = is_deep_gemm_supported()

         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant = self.weight_block_size is not None
@@ -305,6 +309,7 @@ class Fp8LinearMethod(LinearMethodBase):
                 act_quant_group_shape=GroupShape(1, self.weight_block_size[0]),
                 cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                 use_aiter_and_is_supported=self.use_aiter_and_is_supported,
+                use_deep_gemm=self.use_deep_gemm,
             )
         else:
             # Use per-token quantization for better perf if dynamic and cutlass
@@ -440,7 +445,7 @@ class Fp8LinearMethod(LinearMethodBase):
             del layer.input_scale
             return

-        if self.block_quant:
+        if self.block_quant and self.use_deep_gemm:
             maybe_post_process_fp8_weight_block(layer)

     def apply(
def apply( def apply(

View File

@@ -91,6 +91,7 @@ class QuantFP8(CustomOp):
         if (
             self.is_group_quant
+            and self.use_ue8m0
             and self.use_deep_gemm_supported
             and (DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0)
         ):

View File

@@ -356,10 +356,14 @@ class W8A8BlockFp8LinearOp:
         act_quant_group_shape: GroupShape,
         cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
         use_aiter_and_is_supported: bool = False,
+        use_deep_gemm: bool | None = None,
     ):
         self.weight_group_shape = weight_group_shape
         self.act_quant_group_shape = act_quant_group_shape
-        self.is_deep_gemm_supported = is_deep_gemm_supported()
+        if use_deep_gemm is not None:
+            self.is_deep_gemm_supported = use_deep_gemm
+        else:
+            self.is_deep_gemm_supported = is_deep_gemm_supported()
         self.is_hopper = current_platform.is_device_capability(90)
         self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
         self.is_flashinfer_supported = is_flashinfer_fp8_blockscale_gemm_supported()

View File

@@ -23,6 +23,24 @@ from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_gemm
 from vllm.utils.math_utils import cdiv

+_DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES: set[str] = {
+    "qwen3_5_text",
+    "qwen3_5_moe_text",
+}
+
+
+def should_auto_disable_deep_gemm(model_type: str | None) -> bool:
+    """Check if DeepGemm should be auto-disabled for this model on Blackwell.
+
+    Returns True if the model is known to have accuracy degradation with
+    DeepGemm's E8M0 scale format on Blackwell GPUs (SM100+).
+    """
+    if model_type is None:
+        return False
+    if not current_platform.is_device_capability_family(100):
+        return False
+    return model_type in _DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES
+
+
 class DeepGemmQuantScaleFMT(Enum):
     # Float32 scales in Float32 tensor