Compare commits


5 Commits

Author SHA1 Message Date
Shengqi Chen
d7de043d55 [CI] fix version comparison and exclusion patterns in upload-release-wheels.sh (#32971)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
(cherry picked from commit 136c499f6e)
2026-01-23 14:22:49 -08:00
Nicolò Lucchesi
4dc11b06d3 [Bugfix] Fix Whisper/encoder-decoder GPU memory leak (#32789)
Signed-off-by: NickLucche <nlucches@redhat.com>
(cherry picked from commit ea6102b85d)
2026-01-23 02:53:12 -08:00
Isotr0py
2bd95d803a [Misc] Bump opencv-python dependency version to 4.13 (#32668)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit 444e2e7e1f)
2026-01-23 02:52:47 -08:00
Isotr0py
f46d576c54 [Misc] Replace urllib's urlparse with urllib3's parse_url (#32746)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit 8ebf271bb6)
2026-01-23 02:51:53 -08:00
Shengqi Chen
d68209402d [build] fix cu130 related release pipeline steps and publish as nightly image (#32522)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
(cherry picked from commit 965765aef9)
2026-01-17 18:38:46 -08:00
15 changed files with 188 additions and 58 deletions

View File

@@ -141,7 +141,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
       # re-tag to default image tag and push, just in case arm64 build fails
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
@@ -154,7 +154,8 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
   - label: "Create multi-arch manifest - CUDA 13.0"
@@ -243,7 +244,6 @@ steps:
       # Build vLLM ROCm image using the base
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
@@ -252,17 +252,7 @@ steps:
     agents:
       queue: small_cpu_queue_postmerge
     commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "bash .buildkite/scripts/push-nightly-builds.sh"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -273,6 +263,25 @@ steps:
       DOCKER_BUILDKIT: "1"
       DOCKERHUB_USERNAME: "vllmbot"
+  - label: "Build and publish nightly multi-arch image to DockerHub - CUDA 13.0"
+    depends_on:
+      - create-multi-arch-manifest-cuda-13-0
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
 # =============================================================================
 # ROCm Release Pipeline (x86_64 only)
 # =============================================================================

View File

@@ -3,7 +3,14 @@
 set -ex

 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+# This script uses DockerHub API to list and delete old tags with specified prefix
+# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
+# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
+
+# Get tag prefix from argument, default to "nightly-" if not provided
+TAG_PREFIX="${1:-nightly-}"
+echo "Cleaning up tags with prefix: $TAG_PREFIX"

 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -45,7 +52,7 @@ get_all_tags() {
     set -x
     # Get both last_updated timestamp and tag name, separated by |
-    local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+    local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
     if [ -z "$tags" ]; then
       break
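The parameterized prefix also explains why the CUDA 13.0 variant publishes under "cu130-nightly-" rather than "nightly-cu130-": a plain prefix match on "nightly-" would otherwise sweep the variant tags into the default cleanup. A minimal Python sketch of the selection logic the jq filter above implements, with hypothetical tag names:

tags = ["nightly-abc123", "cu130-nightly-abc123"]  # hypothetical DockerHub tags

# Default invocation (TAG_PREFIX="nightly-") must not touch the variant builds:
print([t for t in tags if t.startswith("nightly-")])        # ['nightly-abc123']
# The CUDA 13.0 step passes "cu130-nightly-" and prunes only its own builds:
print([t for t in tags if t.startswith("cu130-nightly-")])  # ['cu130-nightly-abc123']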

View File

@@ -0,0 +1,36 @@
+#!/bin/bash
+set -ex
+
+# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
+# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
+# otherwise they will be cleaned up together with the main "nightly" tags.
+TAG_VARIANT="$1"
+if [ -n "$TAG_VARIANT" ]; then
+    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
+    TAG_NAME="$TAG_VARIANT-nightly"
+else
+    ORIG_TAG_SUFFIX=""
+    TAG_NAME="nightly"
+fi
+
+ORIG_TAG_NAME="$BUILDKITE_COMMIT"
+echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
+
+# pull original arch-dependent images from AWS ECR Public
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
+
+# tag arch-dependent images
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
+
+# push arch-dependent images to DockerHub
+docker push vllm/vllm-openai:$TAG_NAME-x86_64
+docker push vllm/vllm-openai:$TAG_NAME-aarch64
+
+# push arch-independent manifest to DockerHub
+docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest push vllm/vllm-openai:$TAG_NAME
+docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
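The pipeline invokes this script as "bash push-nightly-builds.sh" for the default build and "bash push-nightly-builds.sh cu130" for the CUDA 13.0 variant. An illustrative Python mirror of the tag derivation above (the commit SHA is hypothetical):

def nightly_tags(commit: str, variant: str = "") -> list[str]:
    # Mirrors the TAG_VARIANT / TAG_NAME logic in the script above
    tag = f"{variant}-nightly" if variant else "nightly"
    return [
        f"vllm/vllm-openai:{tag}-x86_64",
        f"vllm/vllm-openai:{tag}-aarch64",
        f"vllm/vllm-openai:{tag}",
        f"vllm/vllm-openai:{tag}-{commit}",
    ]

print(nightly_tags("0f1c2d3"))           # ... vllm/vllm-openai:nightly ...
print(nightly_tags("0f1c2d3", "cu130"))  # ... vllm/vllm-openai:cu130-nightly ...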

View File

@@ -16,7 +16,7 @@ else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION" echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi fi
# sanity check for version mismatch # sanity check for version mismatch
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
echo "[WARNING] Force release and ignore version mismatch" echo "[WARNING] Force release and ignore version mismatch"
else else
@@ -24,6 +24,7 @@ if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
     exit 1
   fi
 fi

+PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'

 # check pypi token
 if [ -z "$PYPI_TOKEN" ]; then
@@ -81,16 +82,16 @@ echo "Existing wheels on S3:"
 aws s3 ls "$S3_COMMIT_PREFIX"

 echo "Copying wheels to local directory"
 mkdir -p $DIST_DIR
-# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
-aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
+# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
+aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"

 # generate source tarball
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
 ls -la $DIST_DIR

 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
-PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
+PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
 if [ -z "$PYPI_WHEEL_FILES" ]; then
   echo "No default variant wheels found, quitting..."
   exit 1
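The exclusion-pattern change is the subtle one: "aarch64" contains the substring "rc", so the old --exclude "*rc*" silently dropped every aarch64 release wheel, while "*rc[0-9]*" only matches genuine release candidates. PURE_VERSION strips the leading "v" so the include glob matches wheel filenames, which carry the bare version. A quick check with Python's fnmatch, whose globs behave like the S3 CLI patterns used here (wheel names are hypothetical):

from fnmatch import fnmatch

wheels = [
    "vllm-0.11.1-cp38-abi3-manylinux2014_aarch64.whl",  # release wheel; "aarch64" contains "rc"
    "vllm-0.11.1rc2-cp38-abi3-manylinux1_x86_64.whl",   # true release candidate
]
for name in wheels:
    print(name, fnmatch(name, "*rc*"), fnmatch(name, "*rc[0-9]*"))
# aarch64 wheel: True  False -> the old pattern wrongly excluded it
# rc2 wheel:     True  True  -> both patterns exclude the real rc build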

View File

@@ -32,7 +32,7 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.17.0
 mistral_common[image] >= 1.8.8
-opencv-python-headless >= 4.11.0 # required for video IO
+opencv-python-headless >= 4.13.0 # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12

View File

@@ -25,7 +25,7 @@ transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
-opencv-python-headless >= 4.11.0 # required for video test
+opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
@@ -37,8 +37,8 @@ bitsandbytes>=0.46.1
 buildkite-test-collector==0.1.9
-genai_perf==0.0.8
-tritonclient==2.51.0
+genai_perf>=0.0.8
+tritonclient>=2.51.0
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy

View File

@@ -33,7 +33,7 @@ matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
-opencv-python-headless >= 4.11.0 # required for video test
+opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
@@ -45,8 +45,8 @@ bitsandbytes==0.46.1
 buildkite-test-collector==0.1.9
-genai_perf==0.0.8
-tritonclient==2.51.0
+genai_perf>=0.0.8
+tritonclient>=2.51.0
 arctic-inference == 0.1.1 # Required for suffix decoding test
 numba == 0.61.2 # Required for N-gram speculative decoding

View File

@@ -31,7 +31,11 @@ albumentations==1.4.6
     #   -r requirements/test.in
     #   terratorch
 alembic==1.16.4
-    # via mlflow
+    # via
+    #   mlflow
+    #   optuna
+annotated-doc==0.0.4
+    # via fastapi
 annotated-types==0.7.0
     # via pydantic
 antlr4-python3-runtime==4.9.3
@@ -143,6 +147,8 @@ colorama==0.4.6
     #   tqdm-multiprocess
 colorful==0.5.6
     # via ray
+colorlog==6.10.1
+    # via optuna
 contourpy==1.3.0
     # via matplotlib
 coverage==7.10.6
@@ -250,7 +256,7 @@ fsspec==2024.9.0
     #   torch
 ftfy==6.3.1
     # via open-clip-torch
-genai-perf==0.0.8
+genai-perf==0.0.16
     # via -r requirements/test.in
 genson==1.3.0
     # via datamodel-code-generator
@@ -387,6 +393,7 @@ jinja2==3.1.6
     # via
     #   datamodel-code-generator
     #   flask
+    #   genai-perf
     #   mlflow
     #   torch
 jiwer==3.0.5
@@ -526,7 +533,7 @@ numba==0.61.2
     #   librosa
 numexpr==2.10.1
     # via lm-eval
-numpy==1.26.4
+numpy==2.2.6
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -556,6 +563,7 @@ numpy==1.26.4
     #   numba
     #   numexpr
     #   opencv-python-headless
+    #   optuna
     #   pandas
     #   patsy
     #   peft
@@ -635,7 +643,7 @@ opencensus==0.11.4
     # via ray
 opencensus-context==0.1.3
     # via opencensus
-opencv-python-headless==4.11.0.86
+opencv-python-headless==4.13.0.90
     # via
     #   -r requirements/test.in
     #   albucore
@@ -658,6 +666,10 @@ opentelemetry-sdk==1.35.0
     #   ray
 opentelemetry-semantic-conventions==0.56b0
     # via opentelemetry-sdk
+optuna==3.6.1
+    # via genai-perf
+orjson==3.11.5
+    # via genai-perf
 packaging==24.2
     # via
     #   accelerate
@@ -676,6 +688,7 @@ packaging==24.2
     #   lightning-utilities
     #   matplotlib
     #   mlflow-skinny
+    #   optuna
     #   peft
     #   plotly
     #   pooch
@@ -715,6 +728,8 @@ peft==0.16.0
     #   lm-eval
 perceptron==0.1.4
     # via -r requirements/test.in
+perf-analyzer==0.1.0
+    # via genai-perf
 pillow==10.4.0
     # via
     #   genai-perf
@@ -901,6 +916,7 @@ pyyaml==6.0.2
     #   lightning
     #   mlflow-skinny
     #   omegaconf
+    #   optuna
     #   peft
     #   pytorch-lightning
     #   ray
@@ -1063,6 +1079,7 @@ sortedcontainers==2.4.0
 soundfile==0.12.1
     # via
     #   -r requirements/test.in
+    #   genai-perf
     #   librosa
     #   mistral-common
 soxr==0.5.0.post1
@@ -1073,6 +1090,7 @@ sqlalchemy==2.0.41
     # via
     #   alembic
     #   mlflow
+    #   optuna
 sqlitedict==2.1.0
     # via lm-eval
 sqlparse==0.5.3
@@ -1202,6 +1220,7 @@ tqdm==4.66.6
     #   mteb
     #   nltk
     #   open-clip-torch
+    #   optuna
     #   peft
     #   pqdm
     #   pretrainedmodels
@@ -1224,10 +1243,8 @@ transformers-stream-generator==0.0.5
     # via -r requirements/test.in
 triton==3.5.1
     # via torch
-tritonclient==2.51.0
-    # via
-    #   -r requirements/test.in
-    #   genai-perf
+tritonclient==2.64.0
+    # via -r requirements/test.in
 typepy==1.3.2
     # via
     #   dataproperty

View File

@@ -267,12 +267,16 @@ async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
     out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
     assert len(out_tokens) == 1

     # max_completion_tokens > max_model_len
+    # max_model_len=32768 for Gemma-3n-E2B-it
     transcription = await client.audio.transcriptions.create(
         model=model_name,
         file=mary_had_lamb,
         response_format="text",
         temperature=0.0,
-        extra_body={"max_completion_tokens": int(1e6)},
+        extra_body={
+            "max_completion_tokens": int(1e6),
+            "repetition_penalty": 1.3,
+        },
     )
     out = json.loads(transcription)
     out_text = out["text"]

View File

@@ -176,3 +176,46 @@ def test_models_distributed(
         distributed_executor_backend=distributed_executor_backend,
         enforce_eager=False,
     )
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+def test_encoder_cache_cleanup(
+    vllm_runner,
+    model: str,
+    input_audios,
+    monkeypatch,
+) -> None:
+    """Test that encoder cache is properly cleaned up after requests complete.
+
+    This is a regression test for a bug where encoder cache entries were freed
+    in the same scheduling step they were allocated, before the model could use
+    them.
+    """
+    # Set single-process mode to access the model runner's encoder cache directly
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    check_model_available(model)
+
+    with vllm_runner(
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=1,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=True,
+    ) as vllm_model:
+        engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
+        model_runner = engine_core.model_executor.driver_worker.worker.model_runner
+        encoder_cache = model_runner.encoder_cache
+
+        # Run multiple sequential requests to ensure cache is properly managed
+        for vllm_prompts, _, audios in input_audios:
+            vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
+
+        # After all requests complete, encoder cache should be empty
+        cache_size = len(encoder_cache)
+        assert cache_size == 0, (
+            f"Encoder cache should be empty after all requests complete, "
+            f"but has {cache_size} entries. This indicates encoder cache "
+            f"entries are not being properly freed."
+        )

View File

@@ -3,10 +3,10 @@
 from collections.abc import Mapping, MutableMapping
 from pathlib import Path
-from urllib.parse import urlparse

 import aiohttp
 import requests
+from urllib3.util import parse_url

 from vllm.version import __version__ as VLLM_VERSION
@@ -37,7 +37,7 @@ class HTTPConnection:
         return self._async_client

     def _validate_http_url(self, url: str):
-        parsed_url = urlparse(url)
+        parsed_url = parse_url(url)
         if parsed_url.scheme not in ("http", "https"):
             raise ValueError(

View File

@@ -442,9 +442,9 @@ def get_vllm_port() -> int | None:
     try:
         return int(port)
     except ValueError as err:
-        from urllib.parse import urlparse
+        from urllib3.util import parse_url

-        parsed = urlparse(port)
+        parsed = parse_url(port)
         if parsed.scheme:
             raise ValueError(
                 f"VLLM_PORT '{port}' appears to be a URI. "

View File

@@ -9,13 +9,13 @@ from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar
-from urllib.parse import ParseResult, urlparse
 from urllib.request import url2pathname

 import numpy as np
 import numpy.typing as npt
 import torch
 from PIL import Image, UnidentifiedImageError
+from urllib3.util import Url, parse_url

 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
@@ -101,11 +101,14 @@ class MediaConnector:
     def _load_data_url(
         self,
-        url_spec: ParseResult,
+        url_spec: Url,
         media_io: MediaIO[_M],
     ) -> _M:  # type: ignore[type-var]
-        data_spec, data = url_spec.path.split(",", 1)
+        url_spec_path = url_spec.path or ""
+        data_spec, data = url_spec_path.split(",", 1)
         media_type, data_type = data_spec.split(";", 1)
+        # media_type starts with a leading "/" (e.g., "/video/jpeg")
+        media_type = media_type.lstrip("/")

         if data_type != "base64":
             msg = "Only base64 data URLs are supported for now."
@@ -115,7 +118,7 @@ class MediaConnector:
     def _load_file_url(
         self,
-        url_spec: ParseResult,
+        url_spec: Url,
         media_io: MediaIO[_M],
     ) -> _M:  # type: ignore[type-var]
         allowed_local_media_path = self.allowed_local_media_path
@@ -124,7 +127,9 @@ class MediaConnector:
"Cannot load local files without `--allowed-local-media-path`." "Cannot load local files without `--allowed-local-media-path`."
) )
filepath = Path(url2pathname(url_spec.netloc + url_spec.path)) url_spec_path = url_spec.path or ""
url_spec_netloc = url_spec.netloc or ""
filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
if allowed_local_media_path not in filepath.resolve().parents: if allowed_local_media_path not in filepath.resolve().parents:
raise ValueError( raise ValueError(
f"The file path {filepath} must be a subpath " f"The file path {filepath} must be a subpath "
@@ -133,7 +138,7 @@ class MediaConnector:
         return media_io.load_file(filepath)

-    def _assert_url_in_allowed_media_domains(self, url_spec: ParseResult) -> None:
+    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
         if (
             self.allowed_media_domains
             and url_spec.hostname not in self.allowed_media_domains
@@ -151,9 +156,9 @@ class MediaConnector:
         *,
         fetch_timeout: int | None = None,
     ) -> _M:  # type: ignore[type-var]
-        url_spec = urlparse(url)
+        url_spec = parse_url(url)

-        if url_spec.scheme.startswith("http"):
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)

             connection = self.connection
@@ -181,10 +186,10 @@ class MediaConnector:
         *,
         fetch_timeout: int | None = None,
     ) -> _M:
-        url_spec = urlparse(url)
+        url_spec = parse_url(url)
         loop = asyncio.get_running_loop()

-        if url_spec.scheme.startswith("http"):
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)

             connection = self.connection
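A related subtlety motivates the `or ""` guards above: urllib3's Url reports absent components as None, where urlparse returns empty strings. A minimal illustration (assumed inputs):

from urllib.parse import urlparse
from urllib3.util import parse_url

print(urlparse("http://example.com").path)   # '' (empty string)
print(parse_url("http://example.com").path)  # None
# Hence `url_spec.path or ""` / `url_spec.netloc or ""` before concatenation,
# and the explicit `url_spec.scheme and ...` check before startswith("http").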

View File

@@ -11,12 +11,12 @@ from collections.abc import (
     Sequence,
 )
 from typing import Any
-from urllib.parse import urlparse
 from uuid import uuid4

 import psutil
 import zmq
 import zmq.asyncio
+from urllib3.util import parse_url

 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -217,13 +217,15 @@ def find_process_using_port(port: int) -> psutil.Process | None:
 def split_zmq_path(path: str) -> tuple[str, str, str]:
     """Split a zmq path into its parts."""
-    parsed = urlparse(path)
+    parsed = parse_url(path)
     if not parsed.scheme:
         raise ValueError(f"Invalid zmq path: {path}")

     scheme = parsed.scheme
     host = parsed.hostname or ""
     port = str(parsed.port or "")
+    if host.startswith("[") and host.endswith("]"):
+        host = host[1:-1]  # Remove brackets for IPv6 address

     if scheme == "tcp" and not all((host, port)):
         # The host and port fields are required for tcp
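The added bracket handling compensates for another divergence between the parsers: for IPv6 literals, urlparse's hostname strips the square brackets while urllib3 keeps them, so split_zmq_path removes them manually to preserve the old return values. An illustrative comparison:

from urllib.parse import urlparse
from urllib3.util import parse_url

path = "tcp://[::1]:5555"
print(urlparse(path).hostname)   # '::1'
print(parse_url(path).hostname)  # '[::1]' -- brackets retained
# split_zmq_path() now strips the brackets so callers still see '::1'.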

View File

@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
         self.num_free_slots = cache_size
-        self.freed: list[str] = []
+        self.allocated: list[str] = []
+        self.to_free: list[str] = []

     def check_and_update_cache(self, request: Request, input_id: int) -> bool:
         return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         self.num_free_slots -= num_encoder_embeds

         mm_hash = request.mm_features[input_id].identifier
-        self.freed.append(mm_hash)
+        self.allocated.append(mm_hash)

     def free(self, request: Request) -> None:
         for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         return set(range(len(request.mm_features)))

     def get_freed_mm_hashes(self) -> list[str]:
-        freed = self.freed
-        self.freed = []
-        return freed
+        # As encoder cache is not used for enc-dec models, we can free the entries here
+        # The actual free happens in the runner, *before* the model is executed.
+        # Therefore, `freeable` acts as a buffer to free the entries only after the
+        # model is executed, mimicking the state transition of `EncoderCacheManager`.
+        to_free = self.to_free
+        self.to_free = self.allocated
+        self.allocated = []
+        return to_free

     def free_encoder_input(self, request: Request, input_id: int) -> None:
         num_encoder_embeds = request.get_num_encoder_embeds(input_id)
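The net effect is a one-step delay: a hash recorded at allocation time is only reported as freed on the following call, i.e., after the runner has executed the model with that encoder input. A stripped-down stand-in (hypothetical class, same state transition as the patch):

class DeferredFree:
    def __init__(self) -> None:
        self.allocated: list[str] = []
        self.to_free: list[str] = []

    def allocate(self, mm_hash: str) -> None:
        self.allocated.append(mm_hash)

    def get_freed_mm_hashes(self) -> list[str]:
        # Entries move allocated -> to_free -> returned, one scheduler step apart
        to_free, self.to_free, self.allocated = self.to_free, self.allocated, []
        return to_free

m = DeferredFree()
m.allocate("audio-1")
print(m.get_freed_mm_hashes())  # [] -- step N: the runner can still read the entry
print(m.get_freed_mm_hashes())  # ['audio-1'] -- step N+1: freed after execution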