Compare commits


8 Commits

Author SHA1 Message Date
Shengqi Chen
d7de043d55 [CI] fix version comparsion and exclusion patterns in upload-release-wheels.sh (#32971)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
(cherry picked from commit 136c499f6e)
2026-01-23 14:22:49 -08:00
Nicolò Lucchesi
4dc11b06d3 [Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)
Signed-off-by: NickLucche <nlucches@redhat.com>
(cherry picked from commit ea6102b85d)
2026-01-23 02:53:12 -08:00
Isotr0py
2bd95d803a [Misc] Bump opencv-python dependecy version to 4.13 (#32668)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit 444e2e7e1f)
2026-01-23 02:52:47 -08:00
Isotr0py
f46d576c54 [Misc] Replace urllib's urlparse with urllib3's parse_url (#32746)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit 8ebf271bb6)
2026-01-23 02:51:53 -08:00
Shengqi Chen
d68209402d [build] fix cu130 related release pipeline steps and publish as nightly image (#32522)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
(cherry picked from commit 965765aef9)
2026-01-17 18:38:46 -08:00
Shengqi Chen
b17039bccc [CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032)
(cherry picked from commit 8e61425ee6)
2026-01-16 21:04:48 -08:00
Cyrus Leung
48b67ba75f [Frontend] Standardize use of create_error_response (#32319)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-16 11:35:10 +00:00
TJian
09f4264a55 [Bugfix] Fix ROCm dockerfiles (#32447)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-16 10:50:00 +08:00
28 changed files with 423 additions and 225 deletions

View File

@@ -1,6 +1,6 @@
 steps:
   # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.9"
+  - label: "Build wheel - aarch64 - CUDA 12.9"
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
@@ -11,11 +11,11 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
-  - label: "Build arm64 wheel - CUDA 13.0"
+  - label: "Build wheel - aarch64 - CUDA 13.0"
     depends_on: ~
     id: build-wheel-arm64-cuda-13-0
     agents:
@@ -26,12 +26,12 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
   # aarch64 build
-  - label: "Build arm64 CPU wheel"
+  - label: "Build wheel - aarch64 - CPU"
     depends_on: ~
     id: build-wheel-arm64-cpu
     agents:
@@ -40,39 +40,39 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
   # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
+  - label: "Build wheel - x86_64 - CUDA 12.9"
     depends_on: ~
-    id: build-wheel-cuda-12-9
+    id: build-wheel-x86-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
     env:
       DOCKER_BUILDKIT: "1"
-  - label: "Build wheel - CUDA 13.0"
+  - label: "Build wheel - x86_64 - CUDA 13.0"
     depends_on: ~
-    id: build-wheel-cuda-13-0
+    id: build-wheel-x86-cuda-13-0
     agents:
       queue: cpu_queue_postmerge
     commands:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
   # x86 CPU wheel build
-  - label: "Build x86 CPU wheel"
+  - label: "Build wheel - x86_64 - CPU"
     depends_on: ~
     id: build-wheel-x86-cpu
     agents:
@@ -81,12 +81,12 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+      - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
-  # Build release images (12.9)
-  - label: "Build release image (x86)"
+  # Build release images (CUDA 12.9)
+  - label: "Build release image - x86_64 - CUDA 12.9"
     depends_on: ~
     id: build-release-image-x86
     agents:
@@ -99,7 +99,7 @@ steps:
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-  - label: "Build release image (arm64)"
+  - label: "Build release image - aarch64 - CUDA 12.9"
     depends_on: ~
     id: build-release-image-arm64
     agents:
@@ -109,34 +109,93 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-  # Add job to create multi-arch manifest
-  - label: "Create multi-arch manifest"
+  - label: "Create multi-arch manifest - CUDA 12.9"
     depends_on:
       - build-release-image-x86
       - build-release-image-arm64
     id: create-multi-arch-manifest
     agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
       - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-  - label: "Annotate release workflow"
+  - label: "Annotate release workflow - CUDA 12.9"
     depends_on:
       - create-multi-arch-manifest
     id: annotate-release-workflow
     agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"
+  - block: "Build CUDA 13.0 release images"
+    key: block-release-image-build-cuda-13-0
+    depends_on: ~
+  - label: "Build release image - x86_64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-x86-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+  - label: "Build release image - aarch64 - CUDA 13.0"
+    depends_on: block-release-image-build-cuda-13-0
+    id: build-release-image-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+  - label: "Create multi-arch manifest - CUDA 13.0"
+    depends_on:
+      - build-release-image-x86-cuda-13-0
+      - build-release-image-arm64-cuda-13-0
+    id: create-multi-arch-manifest-cuda-13-0
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
   - input: "Provide Release version here"
     id: input-release-version
     fields:
       - text: "What is the release version?"
         key: release-version
+  - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+    key: block-upload-release-wheels
+    depends_on:
+      - input-release-version
+      - build-wheel-x86-cuda-12-9
+      - build-wheel-x86-cuda-13-0
+      - build-wheel-x86-cpu
+      - build-wheel-arm64-cuda-12-9
+      - build-wheel-arm64-cuda-13-0
+      - build-wheel-arm64-cpu
+  - label: "Upload release wheels to PyPI and GitHub"
+    depends_on:
+      - block-upload-release-wheels
+    id: upload-release-wheels
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/upload-release-wheels.sh"
   - block: "Build CPU release image"
     key: block-cpu-release-image-build
     depends_on: ~
@@ -185,26 +244,15 @@ steps:
       # Build vLLM ROCm image using the base
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
     if: build.env("NIGHTLY") == "1"
     agents:
-      queue: cpu_queue_postmerge
+      queue: small_cpu_queue_postmerge
     commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "bash .buildkite/scripts/push-nightly-builds.sh"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -215,6 +263,25 @@ steps:
       DOCKER_BUILDKIT: "1"
       DOCKERHUB_USERNAME: "vllmbot"
+  - label: "Build and publish nightly multi-arch image to DockerHub - CUDA 13.0"
+    depends_on:
+      - create-multi-arch-manifest-cuda-13-0
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
 # =============================================================================
 # ROCm Release Pipeline (x86_64 only)
 # =============================================================================
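For orientation, a successful release build now publishes roughly the following image tags (a sketch reconstructed from the steps above; the CPU release step, truncated here, adds its own tags):

docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT              # CUDA 12.9 multi-arch manifest
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130        # CUDA 13.0 multi-arch manifest (behind the block step)
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm         # ROCm, x86_64 only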

View File

@@ -3,7 +3,14 @@
 set -ex

 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+# This script uses DockerHub API to list and delete old tags with specified prefix
+# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
+# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
+
+# Get tag prefix from argument, default to "nightly-" if not provided
+TAG_PREFIX="${1:-nightly-}"
+echo "Cleaning up tags with prefix: $TAG_PREFIX"

 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -45,7 +52,7 @@ get_all_tags() {
     set -x
     # Get both last_updated timestamp and tag name, separated by |
-    local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+    local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')

     if [ -z "$tags" ]; then
       break
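For reference, the two call sites wired up in the release pipeline above exercise both forms of the new prefix argument:

bash .buildkite/scripts/cleanup-nightly-builds.sh                  # prunes vllm/vllm-openai:nightly-* tags, keeping the newest 14
bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-   # same retention policy for the cu130-nightly-* tags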

View File

@@ -0,0 +1,36 @@
+#!/bin/bash
+set -ex
+
+# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
+# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
+# otherwise they will be cleaned up together with the main "nightly" tags.
+TAG_VARIANT="$1"
+if [ -n "$TAG_VARIANT" ]; then
+    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
+    TAG_NAME="$TAG_VARIANT-nightly"
+else
+    ORIG_TAG_SUFFIX=""
+    TAG_NAME="nightly"
+fi
+
+ORIG_TAG_NAME="$BUILDKITE_COMMIT"
+echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
+
+# pull original arch-dependent images from AWS ECR Public
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
+
+# tag arch-dependent images
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
+
+# push arch-dependent images to DockerHub
+docker push vllm/vllm-openai:$TAG_NAME-x86_64
+docker push vllm/vllm-openai:$TAG_NAME-aarch64
+
+# push arch-independent manifest to DockerHub
+docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest push vllm/vllm-openai:$TAG_NAME
+docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
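A sketch of what the two pipeline invocations above expand to under this script's tag logic:

# default variant: promotes :$BUILDKITE_COMMIT-{x86_64,aarch64} from ECR
bash .buildkite/scripts/push-nightly-builds.sh
# -> vllm/vllm-openai:nightly, :nightly-x86_64, :nightly-aarch64, :nightly-$BUILDKITE_COMMIT

# cu130 variant: promotes :$BUILDKITE_COMMIT-{x86_64,aarch64}-cu130 from ECR
bash .buildkite/scripts/push-nightly-builds.sh cu130
# -> vllm/vllm-openai:cu130-nightly, :cu130-nightly-x86_64, :cu130-nightly-aarch64, :cu130-nightly-$BUILDKITE_COMMIT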

View File

@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+set -e
+
+BUCKET="vllm-wheels"
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
+echo "Release version from Buildkite: $RELEASE_VERSION"
+
+GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
+if [ -z "$GIT_VERSION" ]; then
+    echo "[FATAL] Not on a git tag, cannot create release."
+    exit 1
+else
+    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
+fi
+
+# sanity check for version mismatch
+if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
+    if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+        echo "[WARNING] Force release and ignore version mismatch"
+    else
+        echo "[FATAL] Release version from Buildkite does not match Git version."
+        exit 1
+    fi
+fi
+
+PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
+
+# check pypi token
+if [ -z "$PYPI_TOKEN" ]; then
+    echo "[FATAL] PYPI_TOKEN is not set."
+    exit 1
+else
+    export TWINE_USERNAME="__token__"
+    export TWINE_PASSWORD="$PYPI_TOKEN"
+fi
+
+# check github token
+if [ -z "$GITHUB_TOKEN" ]; then
+    echo "[FATAL] GITHUB_TOKEN is not set."
+    exit 1
+else
+    export GH_TOKEN="$GITHUB_TOKEN"
+fi
+
+set -x # avoid printing secrets above
+
+# download gh CLI from github
+# Get latest gh CLI version from GitHub API
+GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
+if [ -z "$GH_VERSION" ]; then
+    echo "[FATAL] Failed to get latest gh CLI version from GitHub"
+    exit 1
+fi
+echo "Downloading gh CLI version: $GH_VERSION"
+
+GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
+GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
+GH_INSTALL_DIR="/tmp/gh-install"
+mkdir -p "$GH_INSTALL_DIR"
+pushd "$GH_INSTALL_DIR"
+curl -L -o "$GH_TARBALL" "$GH_URL"
+tar -xzf "$GH_TARBALL"
+GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
+if [ -z "$GH_BIN" ]; then
+    echo "[FATAL] Failed to find gh CLI executable"
+    exit 1
+fi
+echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
+echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
+command "$GH_BIN" release list --limit 5
+popd
+
+# install twine from pypi
+python3 -m venv /tmp/vllm-release-env
+source /tmp/vllm-release-env/bin/activate
+pip install twine
+python3 -m twine --version
+
+# copy release wheels to local directory
+DIST_DIR=/tmp/vllm-release-dist
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+echo "Copying wheels to local directory"
+mkdir -p $DIST_DIR
+# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
+aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
+echo "Wheels copied to local directory"
+
+# generate source tarball
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+ls -la $DIST_DIR
+
+# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
+PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
+if [ -z "$PYPI_WHEEL_FILES" ]; then
+    echo "No default variant wheels found, quitting..."
+    exit 1
+fi
+python3 -m twine check $PYPI_WHEEL_FILES
+python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
+echo "Wheels uploaded to PyPI"
+
+# create release on GitHub with the release version and all wheels
+command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl

View File

@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
 FROM base AS fetch_vllm_1
 ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
 ARG VLLM_BRANCH="main"
+ENV VLLM_REPO=${VLLM_REPO}
+ENV VLLM_BRANCH=${VLLM_BRANCH}
 ONBUILD RUN git clone ${VLLM_REPO} \
             && cd vllm \
             && git fetch -v --prune -- origin ${VLLM_BRANCH} \
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     && pip uninstall -y vllm \
     && uv pip install --system *.whl

+# Install RIXL wheel
+RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
+    uv pip install --system /rixl_install/*.whl
+
 WORKDIR /vllm-workspace
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace

View File

@@ -198,92 +198,6 @@ RUN cd mori \
 RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install

-###
-### RIXL Build
-###
-FROM build_pytorch AS build_rixl
-ARG RIXL_BRANCH
-ARG RIXL_REPO
-ARG ETCD_BRANCH
-ARG ETCD_REPO
-ARG UCX_BRANCH
-ARG UCX_REPO
-ENV ROCM_PATH=/opt/rocm
-ENV UCX_HOME=/usr/local/ucx
-ENV RIXL_HOME=/usr/local/rixl
-ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
-
-# RIXL build system dependences and RDMA support
-RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
-    libgrpc-dev \
-    libgrpc++-dev \
-    libprotobuf-dev \
-    protobuf-compiler-grpc \
-    libcpprest-dev \
-    libaio-dev \
-    librdmacm1 \
-    librdmacm-dev \
-    libibverbs1 \
-    libibverbs-dev \
-    ibverbs-utils \
-    rdmacm-utils \
-    ibverbs-providers
-RUN pip install meson auditwheel patchelf tomlkit
-
-WORKDIR /workspace
-RUN git clone ${ETCD_REPO} && \
-    cd etcd-cpp-apiv3 && \
-    git checkout ${ETCD_BRANCH} && \
-    mkdir build && cd build && \
-    cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
-    make -j$(nproc) && \
-    make install
-
-RUN cd /usr/local/src && \
-    git clone ${UCX_REPO} && \
-    cd ucx && \
-    git checkout ${UCX_BRANCH} && \
-    ./autogen.sh && \
-    mkdir build && cd build && \
-    ../configure \
-        --prefix=/usr/local/ucx \
-        --enable-shared \
-        --disable-static \
-        --disable-doxygen-doc \
-        --enable-optimizations \
-        --enable-devel-headers \
-        --with-rocm=/opt/rocm \
-        --with-verbs \
-        --with-dm \
-        --enable-mt && \
-    make -j && \
-    make -j install
-ENV PATH=/usr/local/ucx/bin:$PATH
-ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
-
-RUN git clone ${RIXL_REPO} /opt/rixl && \
-    cd /opt/rixl && \
-    git checkout ${RIXL_BRANCH} && \
-    meson setup build --prefix=${RIXL_HOME} \
-        -Ducx_path=${UCX_HOME} \
-        -Drocm_path=${ROCM_PATH} && \
-    cd build && \
-    ninja && \
-    ninja install
-
-# Generate RIXL wheel
-RUN cd /opt/rixl && mkdir -p /app/install && \
-    ./contrib/build-wheel.sh \
-        --output-dir /app/install \
-        --rocm-dir ${ROCM_PATH} \
-        --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
-        --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
-
 ###
 ### FlashAttention Build
 ###
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
     cp /install/*.whl /app/debs
 RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
     cp /install/*.whl /app/debs
-RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
-    cp /install/*.whl /app/debs

 FROM base AS final
 RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
@@ -385,12 +297,6 @@ ARG FA_BRANCH
 ARG FA_REPO
 ARG AITER_BRANCH
 ARG AITER_REPO
-ARG RIXL_BRANCH
-ARG RIXL_REPO
-ARG ETCD_BRANCH
-ARG ETCD_REPO
-ARG UCX_BRANCH
-ARG UCX_REPO
 ARG MORI_BRANCH
 ARG MORI_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
     && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
     && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
-    && echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
-    && echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
-    && echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
-    && echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
-    && echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
-    && echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
     && echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
     && echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt

View File

@@ -32,7 +32,7 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.17.0
 mistral_common[image] >= 1.8.8
-opencv-python-headless >= 4.11.0 # required for video IO
+opencv-python-headless >= 4.13.0 # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12

View File

@@ -25,7 +25,7 @@ transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
-opencv-python-headless >= 4.11.0 # required for video test
+opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
@@ -37,8 +37,8 @@ bitsandbytes>=0.46.1
 buildkite-test-collector==0.1.9
-genai_perf==0.0.8
-tritonclient==2.51.0
+genai_perf>=0.0.8
+tritonclient>=2.51.0
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy

View File

@@ -33,7 +33,7 @@ matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
-opencv-python-headless >= 4.11.0 # required for video test
+opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
@@ -45,8 +45,8 @@ bitsandbytes==0.46.1
 buildkite-test-collector==0.1.9
-genai_perf==0.0.8
-tritonclient==2.51.0
+genai_perf>=0.0.8
+tritonclient>=2.51.0
 arctic-inference == 0.1.1 # Required for suffix decoding test
 numba == 0.61.2 # Required for N-gram speculative decoding

View File

@@ -31,7 +31,11 @@ albumentations==1.4.6
     #   -r requirements/test.in
     #   terratorch
 alembic==1.16.4
-    # via mlflow
+    # via
+    #   mlflow
+    #   optuna
+annotated-doc==0.0.4
+    # via fastapi
 annotated-types==0.7.0
     # via pydantic
 antlr4-python3-runtime==4.9.3
@@ -143,6 +147,8 @@ colorama==0.4.6
     #   tqdm-multiprocess
 colorful==0.5.6
     # via ray
+colorlog==6.10.1
+    # via optuna
 contourpy==1.3.0
     # via matplotlib
 coverage==7.10.6
@@ -250,7 +256,7 @@ fsspec==2024.9.0
     #   torch
 ftfy==6.3.1
     # via open-clip-torch
-genai-perf==0.0.8
+genai-perf==0.0.16
     # via -r requirements/test.in
 genson==1.3.0
     # via datamodel-code-generator
@@ -387,6 +393,7 @@ jinja2==3.1.6
     # via
     #   datamodel-code-generator
     #   flask
+    #   genai-perf
     #   mlflow
     #   torch
 jiwer==3.0.5
@@ -526,7 +533,7 @@ numba==0.61.2
     #   librosa
 numexpr==2.10.1
     # via lm-eval
-numpy==1.26.4
+numpy==2.2.6
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -556,6 +563,7 @@ numpy==1.26.4
     #   numba
     #   numexpr
     #   opencv-python-headless
+    #   optuna
     #   pandas
     #   patsy
     #   peft
@@ -635,7 +643,7 @@ opencensus==0.11.4
     # via ray
 opencensus-context==0.1.3
     # via opencensus
-opencv-python-headless==4.11.0.86
+opencv-python-headless==4.13.0.90
     # via
     #   -r requirements/test.in
     #   albucore
@@ -658,6 +666,10 @@ opentelemetry-sdk==1.35.0
     #   ray
 opentelemetry-semantic-conventions==0.56b0
     # via opentelemetry-sdk
+optuna==3.6.1
+    # via genai-perf
+orjson==3.11.5
+    # via genai-perf
 packaging==24.2
     # via
     #   accelerate
@@ -676,6 +688,7 @@ packaging==24.2
     #   lightning-utilities
     #   matplotlib
     #   mlflow-skinny
+    #   optuna
     #   peft
     #   plotly
     #   pooch
@@ -715,6 +728,8 @@ peft==0.16.0
     #   lm-eval
 perceptron==0.1.4
     # via -r requirements/test.in
+perf-analyzer==0.1.0
+    # via genai-perf
 pillow==10.4.0
     # via
     #   genai-perf
@@ -901,6 +916,7 @@ pyyaml==6.0.2
     #   lightning
     #   mlflow-skinny
     #   omegaconf
+    #   optuna
     #   peft
     #   pytorch-lightning
     #   ray
@@ -1063,6 +1079,7 @@ sortedcontainers==2.4.0
 soundfile==0.12.1
     # via
     #   -r requirements/test.in
+    #   genai-perf
     #   librosa
     #   mistral-common
 soxr==0.5.0.post1
@@ -1073,6 +1090,7 @@ sqlalchemy==2.0.41
     # via
     #   alembic
     #   mlflow
+    #   optuna
 sqlitedict==2.1.0
     # via lm-eval
 sqlparse==0.5.3
@@ -1202,6 +1220,7 @@ tqdm==4.66.6
     #   mteb
     #   nltk
     #   open-clip-torch
+    #   optuna
     #   peft
     #   pqdm
     #   pretrainedmodels
@@ -1224,10 +1243,8 @@ transformers-stream-generator==0.0.5
     # via -r requirements/test.in
 triton==3.5.1
     # via torch
-tritonclient==2.51.0
-    # via
-    #   -r requirements/test.in
-    #   genai-perf
+tritonclient==2.64.0
+    # via -r requirements/test.in
 typepy==1.3.2
     # via
     #   dataproperty

View File

@@ -267,12 +267,16 @@ async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
     out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
     assert len(out_tokens) == 1

     # max_completion_tokens > max_model_len
+    # max_model_len=32768 for Gemma-3n-E2B-it
     transcription = await client.audio.transcriptions.create(
         model=model_name,
         file=mary_had_lamb,
         response_format="text",
         temperature=0.0,
-        extra_body={"max_completion_tokens": int(1e6)},
+        extra_body={
+            "max_completion_tokens": int(1e6),
+            "repetition_penalty": 1.3,
+        },
     )

     out = json.loads(transcription)
     out_text = out["text"]

View File

@@ -176,3 +176,46 @@ def test_models_distributed(
         distributed_executor_backend=distributed_executor_backend,
         enforce_eager=False,
     )
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+def test_encoder_cache_cleanup(
+    vllm_runner,
+    model: str,
+    input_audios,
+    monkeypatch,
+) -> None:
+    """Test that encoder cache is properly cleaned up after requests complete.
+
+    This is a regression test for a bug where encoder cache entries were freed
+    in the same scheduling step they were allocated, before the model could use
+    them.
+    """
+    # Set single-process mode to access the model runner's encoder cache directly
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    check_model_available(model)
+
+    with vllm_runner(
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=1,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=True,
+    ) as vllm_model:
+        engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
+        model_runner = engine_core.model_executor.driver_worker.worker.model_runner
+        encoder_cache = model_runner.encoder_cache
+
+        # Run multiple sequential requests to ensure cache is properly managed
+        for vllm_prompts, _, audios in input_audios:
+            vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
+
+        # After all requests complete, encoder cache should be empty
+        cache_size = len(encoder_cache)
+        assert cache_size == 0, (
+            f"Encoder cache should be empty after all requests complete, "
+            f"but has {cache_size} entries. This indicates encoder cache "
+            f"entries are not being properly freed."
+        )

View File

@@ -3,10 +3,10 @@
 from collections.abc import Mapping, MutableMapping
 from pathlib import Path
-from urllib.parse import urlparse

 import aiohttp
 import requests
+from urllib3.util import parse_url

 from vllm.version import __version__ as VLLM_VERSION
@@ -37,7 +37,7 @@ class HTTPConnection:
         return self._async_client

     def _validate_http_url(self, url: str):
-        parsed_url = urlparse(url)
+        parsed_url = parse_url(url)

         if parsed_url.scheme not in ("http", "https"):
             raise ValueError(

View File

@@ -540,14 +540,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     try:
         generator = await handler.create_completion(request, raw_request)
-    except OverflowError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(

View File

@@ -86,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
     construct_input_messages,
 )
 from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
-from vllm.entrypoints.utils import _validate_truncation_size
+from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import (
     PromptComponents,
@@ -760,11 +760,15 @@ class OpenAIServing:
             err_type = "BadRequestError"
             status_code = HTTPStatus.BAD_REQUEST
             param = exc.parameter
-        elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
+        elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
             # Common validation errors from user input
             err_type = "BadRequestError"
             status_code = HTTPStatus.BAD_REQUEST
             param = None
+        elif isinstance(exc, NotImplementedError):
+            err_type = "NotImplementedError"
+            status_code = HTTPStatus.NOT_IMPLEMENTED
+            param = None
         elif exc.__class__.__name__ == "TemplateError":
             # jinja2.TemplateError (avoid importing jinja2)
             err_type = "BadRequestError"
@@ -783,9 +787,10 @@ class OpenAIServing:
             traceback.print_exc()
         else:
             traceback.print_stack()
+
         return ErrorResponse(
             error=ErrorInfo(
-                message=message,
+                message=sanitize_message(message),
                 type=err_type,
                 code=status_code.value,
                 param=param,

View File

@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
     ModelPermission,
     UnloadLoRAAdapterRequest,
 )
+from vllm.entrypoints.utils import sanitize_message
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
@@ -300,5 +301,9 @@
     status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
 ) -> ErrorResponse:
     return ErrorResponse(
-        error=ErrorInfo(message=message, type=err_type, code=status_code.value)
+        error=ErrorInfo(
+            message=sanitize_message(message),
+            type=err_type,
+            code=status_code.value,
+        )
     )

View File

@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from starlette.responses import JSONResponse
 from typing_extensions import assert_never
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
     try:
         generator = await handler.create_classify(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code

View File

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
@@ -47,9 +47,7 @@ async def create_embedding(
     try:
         generator = await handler.create_embedding(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(

View File

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
     try:
         generator = await handler.create_pooling(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code

View File

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus

-from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi import APIRouter, Depends, Request
 from fastapi.responses import JSONResponse
 from typing_extensions import assert_never
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
     try:
         generator = await handler.create_score(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
     try:
         generator = await handler.do_rerank(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code

View File

@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
     try:
         generator = await handler.serve_tokens(request, raw_request)
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(
             content=generator.model_dump(), status_code=generator.error.code

View File

@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
     try:
         generator = await handler.create_tokenize(request, raw_request)
-    except NotImplementedError as e:
-        raise HTTPException(
-            status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
-        ) from e
     except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
+        return handler.create_error_response(e)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(

View File

@@ -7,7 +7,7 @@ import functools
 import os
 from argparse import Namespace
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import regex as re
 from fastapi import Request
@@ -22,18 +22,25 @@ from vllm.entrypoints.chat_utils import (
     resolve_hf_chat_template,
     resolve_mistral_chat_template,
 )
-from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    CompletionRequest,
-    StreamOptions,
-)
-from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.utils.argparse_utils import FlexibleArgumentParser

+if TYPE_CHECKING:
+    from vllm.entrypoints.openai.protocol import (
+        ChatCompletionRequest,
+        CompletionRequest,
+        StreamOptions,
+    )
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+else:
+    ChatCompletionRequest = object
+    CompletionRequest = object
+    StreamOptions = object
+    LoRAModulePath = object
+
 logger = init_logger(__name__)

 VLLM_SUBCMD_PARSER_EPILOG = (
@@ -206,7 +213,7 @@ def _validate_truncation_size(
 def get_max_tokens(
     max_model_len: int,
-    request: ChatCompletionRequest | CompletionRequest,
+    request: "ChatCompletionRequest | CompletionRequest",
     input_length: int,
     default_sampling_params: dict,
 ) -> int:
@@ -227,6 +234,8 @@
 def log_non_default_args(args: Namespace | EngineArgs):
+    from vllm.entrypoints.openai.cli_args import make_arg_parser
+
     non_default_args = {}

     # Handle Namespace
@@ -255,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
 def should_include_usage(
-    stream_options: StreamOptions | None, enable_force_include_usage: bool
+    stream_options: "StreamOptions | None", enable_force_include_usage: bool
 ) -> tuple[bool, bool]:
     if stream_options:
         include_usage = stream_options.include_usage or enable_force_include_usage
@@ -270,6 +279,8 @@ def should_include_usage(
 def process_lora_modules(
     args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
 ) -> list[LoRAModulePath]:
+    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+
     lora_modules = args_lora_modules
     if default_mm_loras:
         default_mm_lora_paths = [

View File

@@ -442,9 +442,9 @@ def get_vllm_port() -> int | None:
     try:
         return int(port)
     except ValueError as err:
-        from urllib.parse import urlparse
+        from urllib3.util import parse_url

-        parsed = urlparse(port)
+        parsed = parse_url(port)
         if parsed.scheme:
             raise ValueError(
                 f"VLLM_PORT '{port}' appears to be a URI. "

View File

@@ -9,13 +9,13 @@ from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar
-from urllib.parse import ParseResult, urlparse
 from urllib.request import url2pathname

 import numpy as np
 import numpy.typing as npt
 import torch
 from PIL import Image, UnidentifiedImageError
+from urllib3.util import Url, parse_url

 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
@@ -101,11 +101,14 @@ class MediaConnector:
     def _load_data_url(
         self,
-        url_spec: ParseResult,
+        url_spec: Url,
         media_io: MediaIO[_M],
     ) -> _M:  # type: ignore[type-var]
-        data_spec, data = url_spec.path.split(",", 1)
+        url_spec_path = url_spec.path or ""
+        data_spec, data = url_spec_path.split(",", 1)
         media_type, data_type = data_spec.split(";", 1)
+        # media_type starts with a leading "/" (e.g., "/video/jpeg")
+        media_type = media_type.lstrip("/")

         if data_type != "base64":
             msg = "Only base64 data URLs are supported for now."
@@ -115,7 +118,7 @@ class MediaConnector:
     def _load_file_url(
         self,
-        url_spec: ParseResult,
+        url_spec: Url,
         media_io: MediaIO[_M],
     ) -> _M:  # type: ignore[type-var]
         allowed_local_media_path = self.allowed_local_media_path
@@ -124,7 +127,9 @@ class MediaConnector:
                 "Cannot load local files without `--allowed-local-media-path`."
             )

-        filepath = Path(url2pathname(url_spec.netloc + url_spec.path))
+        url_spec_path = url_spec.path or ""
+        url_spec_netloc = url_spec.netloc or ""
+        filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
         if allowed_local_media_path not in filepath.resolve().parents:
             raise ValueError(
                 f"The file path {filepath} must be a subpath "
@@ -133,7 +138,7 @@ class MediaConnector:
         return media_io.load_file(filepath)

-    def _assert_url_in_allowed_media_domains(self, url_spec: ParseResult) -> None:
+    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
         if (
             self.allowed_media_domains
             and url_spec.hostname not in self.allowed_media_domains
@@ -151,9 +156,9 @@ class MediaConnector:
         *,
         fetch_timeout: int | None = None,
     ) -> _M:  # type: ignore[type-var]
-        url_spec = urlparse(url)
+        url_spec = parse_url(url)

-        if url_spec.scheme.startswith("http"):
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)

             connection = self.connection
@@ -181,10 +186,10 @@ class MediaConnector:
         *,
         fetch_timeout: int | None = None,
     ) -> _M:
-        url_spec = urlparse(url)
+        url_spec = parse_url(url)
         loop = asyncio.get_running_loop()

-        if url_spec.scheme.startswith("http"):
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)

             connection = self.connection
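The recurring `or ""` and `and`-guards in this file exist because stdlib `urlparse` returns empty strings for absent URL components while urllib3's `parse_url` leaves them as `None`. A small sketch of the difference, assuming urllib3 2.x semantics (the URL is an arbitrary example):

```python
from urllib3.util import parse_url

url_spec = parse_url("file:///tmp/example.png")
print(url_spec.scheme)  # 'file'
print(url_spec.path)    # '/tmp/example.png'
print(url_spec.host)    # may be None -- urllib3 leaves absent parts unset

# Guard before concatenating, as the patched _load_file_url now does:
filepath = (url_spec.netloc or "") + (url_spec.path or "")
print(filepath)         # '/tmp/example.png'
```

Without the guards, `None + str` or `None.startswith(...)` would raise `TypeError`/`AttributeError` on URLs missing those parts.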

View File

@@ -11,12 +11,12 @@ from collections.abc import (
     Sequence,
 )
 from typing import Any
-from urllib.parse import urlparse
 from uuid import uuid4

 import psutil
 import zmq
 import zmq.asyncio
+from urllib3.util import parse_url

 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -217,13 +217,15 @@ def find_process_using_port(port: int) -> psutil.Process | None:
 def split_zmq_path(path: str) -> tuple[str, str, str]:
     """Split a zmq path into its parts."""
-    parsed = urlparse(path)
+    parsed = parse_url(path)
     if not parsed.scheme:
         raise ValueError(f"Invalid zmq path: {path}")

     scheme = parsed.scheme
     host = parsed.hostname or ""
     port = str(parsed.port or "")
+    if host.startswith("[") and host.endswith("]"):
+        host = host[1:-1]  # Remove brackets for IPv6 address
+
     if scheme == "tcp" and not all((host, port)):
         # The host and port fields are required for tcp
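The new bracket-stripping step compensates for another behavioral difference between the two parsers on IPv6 hosts: stdlib `urlparse().hostname` strips the brackets, while the diff's added normalization implies urllib3 keeps them. A sketch under that assumption:

```python
from urllib.parse import urlparse
from urllib3.util import parse_url

path = "tcp://[::1]:5555"
print(urlparse(path).hostname)   # '::1'   -- stdlib strips the brackets
print(parse_url(path).hostname)  # '[::1]' -- urllib3 keeps them

# Hence the normalization added to split_zmq_path:
host = parse_url(path).hostname or ""
if host.startswith("[") and host.endswith("]"):
    host = host[1:-1]
print(host)  # '::1'
```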

View File

@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
         self.num_free_slots = cache_size
-        self.freed: list[str] = []
+        self.allocated: list[str] = []
+        self.to_free: list[str] = []

     def check_and_update_cache(self, request: Request, input_id: int) -> bool:
         return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         self.num_free_slots -= num_encoder_embeds

         mm_hash = request.mm_features[input_id].identifier
-        self.freed.append(mm_hash)
+        self.allocated.append(mm_hash)

     def free(self, request: Request) -> None:
         for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         return set(range(len(request.mm_features)))

     def get_freed_mm_hashes(self) -> list[str]:
-        freed = self.freed
-        self.freed = []
-        return freed
+        # The encoder cache is not reused for enc-dec models, but the actual
+        # free happens in the runner, *before* the model is executed.
+        # Therefore, `to_free` acts as a buffer so entries are reported as
+        # freed only after the model has executed, mimicking the state
+        # transitions of `EncoderCacheManager`.
+        to_free = self.to_free
+        self.to_free = self.allocated
+        self.allocated = []
+        return to_free

     def free_encoder_input(self, request: Request, input_id: int) -> None:
         num_encoder_embeds = request.get_num_encoder_embeds(input_id)
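The memory-leak fix above boils down to a two-phase (double-buffered) free: hashes allocated at step N are only reported as freed at step N+1, after the runner has actually executed the model. A minimal, self-contained sketch of the pattern (the class and method names here are hypothetical, simplified from the diff):

```python
class DeferredFreeBuffer:
    def __init__(self) -> None:
        self.allocated: list[str] = []  # entries touched this step
        self.to_free: list[str] = []    # entries held over from the last step

    def allocate(self, mm_hash: str) -> None:
        self.allocated.append(mm_hash)

    def get_freed(self) -> list[str]:
        # Rotate buffers: report last step's entries, hold this step's back.
        freed, self.to_free, self.allocated = self.to_free, self.allocated, []
        return freed


buf = DeferredFreeBuffer()
buf.allocate("hash-a")
assert buf.get_freed() == []          # step N: nothing reported yet
assert buf.get_freed() == ["hash-a"]  # step N+1: now safe to free
```

Freeing eagerly, as the old `self.freed` list did, let the runner release encoder entries before the model consumed them, which is the GPU memory leak this commit addresses.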