Compare commits
5 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | d7de043d55 |  |
|  | 4dc11b06d3 |  |
|  | 2bd95d803a |  |
|  | f46d576c54 |  |
|  | d68209402d |  |
```diff
@@ -141,7 +141,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
       # re-tag to default image tag and push, just in case arm64 build fails
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
@@ -154,7 +154,8 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
+      # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
 
   - label: "Create multi-arch manifest - CUDA 13.0"
@@ -243,7 +244,6 @@ steps:
       # Build vLLM ROCm image using the base
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
 
-
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
@@ -252,17 +252,7 @@ steps:
     agents:
       queue: small_cpu_queue_postmerge
     commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "bash .buildkite/scripts/push-nightly-builds.sh"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -273,6 +263,25 @@ steps:
       DOCKER_BUILDKIT: "1"
       DOCKERHUB_USERNAME: "vllmbot"
 
+  - label: "Build and publish nightly multi-arch image to DockerHub - CUDA 13.0"
+    depends_on:
+      - create-multi-arch-manifest-cuda-13-0
+    if: build.env("NIGHTLY") == "1"
+    agents:
+      queue: small_cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
+    env:
+      DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
+
+
 # =============================================================================
 # ROCm Release Pipeline (x86_64 only)
 # =============================================================================
```
.buildkite/scripts/cleanup-nightly-builds.sh
```diff
@@ -3,7 +3,14 @@
 set -ex
 
 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+# This script uses DockerHub API to list and delete old tags with specified prefix
+# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
+# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
+
+# Get tag prefix from argument, default to "nightly-" if not provided
+TAG_PREFIX="${1:-nightly-}"
+
+echo "Cleaning up tags with prefix: $TAG_PREFIX"
 
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -45,7 +52,7 @@ get_all_tags() {
     set -x
 
     # Get both last_updated timestamp and tag name, separated by |
-    local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+    local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
 
     if [ -z "$tags" ]; then
       break
```
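The effect of the new `--arg prefix` jq filter can be sanity-checked outside the pipeline; a minimal Python sketch of the same `startswith` selection, using made-up tag names, looks like this:

```python
# Hypothetical tag names mimicking what the DockerHub API returns; only the
# prefix-matching logic mirrors the jq filter in cleanup-nightly-builds.sh.
tags = ["nightly-x86_64", "nightly-abc123", "cu130-nightly-x86_64", "v0.11.0"]

def matching_tags(tags: list[str], prefix: str) -> list[str]:
    # jq: .results[] | select(.name | startswith($prefix))
    return [t for t in tags if t.startswith(prefix)]

print(matching_tags(tags, "nightly-"))        # ['nightly-x86_64', 'nightly-abc123']
print(matching_tags(tags, "cu130-nightly-"))  # ['cu130-nightly-x86_64']
```

Note that variant tags are named `cu130-nightly-*` rather than `nightly-cu130-*`: a plain `nightly-` prefix match would otherwise sweep them up with the default builds, which is exactly the limitation the new push script's comment calls out below.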
.buildkite/scripts/push-nightly-builds.sh (new executable file, 36 lines)
```diff
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+set -ex
+
+# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
+# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
+# otherwise they will be cleaned up together with the main "nightly" tags.
+
+TAG_VARIANT="$1"
+if [ -n "$TAG_VARIANT" ]; then
+  ORIG_TAG_SUFFIX="-$TAG_VARIANT"
+  TAG_NAME="$TAG_VARIANT-nightly"
+else
+  ORIG_TAG_SUFFIX=""
+  TAG_NAME="nightly"
+fi
+
+ORIG_TAG_NAME="$BUILDKITE_COMMIT"
+
+echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
+
+# pull original arch-dependent images from AWS ECR Public
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
+# tag arch-dependent images
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
+# push arch-dependent images to DockerHub
+docker push vllm/vllm-openai:$TAG_NAME-x86_64
+docker push vllm/vllm-openai:$TAG_NAME-aarch64
+# push arch-independent manifest to DockerHub
+docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest push vllm/vllm-openai:$TAG_NAME
+docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
```
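The tag-name derivation is the only branching logic in the script; a small Python rendering of it (a hypothetical helper, for illustration only) makes the two cases explicit:

```python
def nightly_tag_parts(tag_variant: str = "") -> tuple[str, str]:
    """Mirror the TAG_VARIANT handling in push-nightly-builds.sh.

    Returns (orig_tag_suffix, tag_name): the suffix appended to the
    per-commit ECR tag, and the DockerHub nightly tag family to push to.
    """
    if tag_variant:
        return f"-{tag_variant}", f"{tag_variant}-nightly"
    return "", "nightly"

assert nightly_tag_parts() == ("", "nightly")
assert nightly_tag_parts("cu130") == ("-cu130", "cu130-nightly")
```

So the default pipeline step runs `bash .buildkite/scripts/push-nightly-builds.sh` and publishes `vllm/vllm-openai:nightly-*`, while the CUDA 13.0 step passes `cu130` and publishes `vllm/vllm-openai:cu130-nightly-*`; both invocations appear in the pipeline hunks above.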
```diff
@@ -16,7 +16,7 @@ else
   echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
 fi
 # sanity check for version mismatch
-if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
+if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
   if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
     echo "[WARNING] Force release and ignore version mismatch"
   else
@@ -24,6 +24,7 @@ if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
     exit 1
   fi
 fi
+PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
 
 # check pypi token
 if [ -z "$PYPI_TOKEN" ]; then
```
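`PURE_VERSION=${RELEASE_VERSION#v}` uses shell parameter expansion to drop a single leading `v`, which, together with removing the hard-coded `v` from the comparison above, suggests `RELEASE_VERSION` now arrives as the `v`-prefixed tag while wheels and tarballs keep the bare version. A tiny Python illustration of the same normalization, with a hypothetical value:

```python
release_version = "v0.11.0"  # hypothetical tag value, for illustration only
pure_version = release_version.removeprefix("v")  # shell equivalent: ${RELEASE_VERSION#v}
assert pure_version == "0.11.0"  # what the wheel globs and tarball name below use
```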
```diff
@@ -81,16 +82,16 @@ echo "Existing wheels on S3:"
 aws s3 ls "$S3_COMMIT_PREFIX"
 echo "Copying wheels to local directory"
 mkdir -p $DIST_DIR
-# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
-aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
+# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
+aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"
 # generate source tarball
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
 ls -la $DIST_DIR
 
 
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
-PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
+PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
 if [ -z "$PYPI_WHEEL_FILES" ]; then
   echo "No default variant wheels found, quitting..."
   exit 1
```
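The switch from `--exclude "*rc*"` to `--exclude "*rc[0-9]*"` matters because `aarch64` itself contains the substring `rc`, so the old pattern silently dropped every aarch64 wheel; the new comment calls this out. A small Python check using `fnmatch` (an approximation of the S3 CLI's glob semantics, with hypothetical wheel names) shows the difference:

```python
from fnmatch import fnmatch

# Hypothetical wheel file names; only the glob behavior is the point here.
release_wheel = "vllm-0.11.0-cp38-abi3-manylinux2014_aarch64.whl"
rc_wheel = "vllm-0.11.0rc1-cp38-abi3-manylinux2014_x86_64.whl"

assert fnmatch(release_wheel, "*rc*")           # old exclude wrongly matches: "aarch64" contains "rc"
assert not fnmatch(release_wheel, "*rc[0-9]*")  # new exclude keeps the aarch64 release wheel
assert fnmatch(rc_wheel, "*rc[0-9]*")           # release candidates are still filtered out
```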
```diff
@@ -32,7 +32,7 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.17.0
 mistral_common[image] >= 1.8.8
-opencv-python-headless >= 4.11.0 # required for video IO
+opencv-python-headless >= 4.13.0 # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
```
```diff
@@ -25,7 +25,7 @@ transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
-opencv-python-headless >= 4.11.0 # required for video test
+opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
@@ -37,8 +37,8 @@ bitsandbytes>=0.46.1
 buildkite-test-collector==0.1.9
 
 
-genai_perf==0.0.8
-tritonclient==2.51.0
+genai_perf>=0.0.8
+tritonclient>=2.51.0
 
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
```
```diff
@@ -33,7 +33,7 @@ matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
-opencv-python-headless >= 4.11.0 # required for video test
+opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
@@ -45,8 +45,8 @@ bitsandbytes==0.46.1
 buildkite-test-collector==0.1.9
 
 
-genai_perf==0.0.8
-tritonclient==2.51.0
+genai_perf>=0.0.8
+tritonclient>=2.51.0
 
 arctic-inference == 0.1.1 # Required for suffix decoding test
 numba == 0.61.2 # Required for N-gram speculative decoding
```
```diff
@@ -31,7 +31,11 @@ albumentations==1.4.6
     #   -r requirements/test.in
     #   terratorch
 alembic==1.16.4
-    # via mlflow
+    # via
+    #   mlflow
+    #   optuna
+annotated-doc==0.0.4
+    # via fastapi
 annotated-types==0.7.0
     # via pydantic
 antlr4-python3-runtime==4.9.3
@@ -143,6 +147,8 @@ colorama==0.4.6
     #   tqdm-multiprocess
 colorful==0.5.6
     # via ray
+colorlog==6.10.1
+    # via optuna
 contourpy==1.3.0
     # via matplotlib
 coverage==7.10.6
@@ -250,7 +256,7 @@ fsspec==2024.9.0
     #   torch
 ftfy==6.3.1
     # via open-clip-torch
-genai-perf==0.0.8
+genai-perf==0.0.16
     # via -r requirements/test.in
 genson==1.3.0
     # via datamodel-code-generator
@@ -387,6 +393,7 @@ jinja2==3.1.6
     # via
     #   datamodel-code-generator
     #   flask
+    #   genai-perf
     #   mlflow
     #   torch
 jiwer==3.0.5
@@ -526,7 +533,7 @@ numba==0.61.2
     #   librosa
 numexpr==2.10.1
     # via lm-eval
-numpy==1.26.4
+numpy==2.2.6
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -556,6 +563,7 @@ numpy==1.26.4
     #   numba
     #   numexpr
     #   opencv-python-headless
+    #   optuna
     #   pandas
     #   patsy
     #   peft
@@ -635,7 +643,7 @@ opencensus==0.11.4
     # via ray
 opencensus-context==0.1.3
     # via opencensus
-opencv-python-headless==4.11.0.86
+opencv-python-headless==4.13.0.90
     # via
     #   -r requirements/test.in
     #   albucore
@@ -658,6 +666,10 @@ opentelemetry-sdk==1.35.0
     #   ray
 opentelemetry-semantic-conventions==0.56b0
     # via opentelemetry-sdk
+optuna==3.6.1
+    # via genai-perf
+orjson==3.11.5
+    # via genai-perf
 packaging==24.2
     # via
     #   accelerate
@@ -676,6 +688,7 @@ packaging==24.2
     #   lightning-utilities
     #   matplotlib
     #   mlflow-skinny
+    #   optuna
     #   peft
     #   plotly
     #   pooch
@@ -715,6 +728,8 @@ peft==0.16.0
     #   lm-eval
 perceptron==0.1.4
     # via -r requirements/test.in
+perf-analyzer==0.1.0
+    # via genai-perf
 pillow==10.4.0
     # via
     #   genai-perf
@@ -901,6 +916,7 @@ pyyaml==6.0.2
     #   lightning
     #   mlflow-skinny
     #   omegaconf
+    #   optuna
     #   peft
     #   pytorch-lightning
     #   ray
@@ -1063,6 +1079,7 @@ sortedcontainers==2.4.0
 soundfile==0.12.1
     # via
     #   -r requirements/test.in
+    #   genai-perf
     #   librosa
     #   mistral-common
 soxr==0.5.0.post1
@@ -1073,6 +1090,7 @@ sqlalchemy==2.0.41
     # via
     #   alembic
     #   mlflow
+    #   optuna
 sqlitedict==2.1.0
     # via lm-eval
 sqlparse==0.5.3
@@ -1202,6 +1220,7 @@ tqdm==4.66.6
     #   mteb
     #   nltk
     #   open-clip-torch
+    #   optuna
     #   peft
     #   pqdm
     #   pretrainedmodels
@@ -1224,10 +1243,8 @@ transformers-stream-generator==0.0.5
     # via -r requirements/test.in
 triton==3.5.1
     # via torch
-tritonclient==2.51.0
-    # via
-    #   -r requirements/test.in
-    #   genai-perf
+tritonclient==2.64.0
+    # via -r requirements/test.in
 typepy==1.3.2
     # via
     #   dataproperty
```
```diff
@@ -267,12 +267,16 @@ async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
     out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
     assert len(out_tokens) == 1
     # max_completion_tokens > max_model_len
+    # max_model_len=32768 for Gemma-3n-E2B-it
     transcription = await client.audio.transcriptions.create(
         model=model_name,
         file=mary_had_lamb,
         response_format="text",
         temperature=0.0,
-        extra_body={"max_completion_tokens": int(1e6)},
+        extra_body={
+            "max_completion_tokens": int(1e6),
+            "repetition_penalty": 1.3,
+        },
     )
     out = json.loads(transcription)
     out_text = out["text"]
```
```diff
@@ -176,3 +176,46 @@ def test_models_distributed(
         distributed_executor_backend=distributed_executor_backend,
         enforce_eager=False,
     )
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+def test_encoder_cache_cleanup(
+    vllm_runner,
+    model: str,
+    input_audios,
+    monkeypatch,
+) -> None:
+    """Test that encoder cache is properly cleaned up after requests complete.
+
+    This is a regression test for a bug where encoder cache entries were freed
+    in the same scheduling step they were allocated, before the model could use
+    them.
+    """
+    # Set single-process mode to access the model runner's encoder cache directly
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    check_model_available(model)
+
+    with vllm_runner(
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=1,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=True,
+    ) as vllm_model:
+        engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
+        model_runner = engine_core.model_executor.driver_worker.worker.model_runner
+        encoder_cache = model_runner.encoder_cache
+
+        # Run multiple sequential requests to ensure cache is properly managed
+        for vllm_prompts, _, audios in input_audios:
+            vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
+
+        # After all requests complete, encoder cache should be empty
+        cache_size = len(encoder_cache)
+        assert cache_size == 0, (
+            f"Encoder cache should be empty after all requests complete, "
+            f"but has {cache_size} entries. This indicates encoder cache "
+            f"entries are not being properly freed."
+        )
```
```diff
@@ -3,10 +3,10 @@
 
 from collections.abc import Mapping, MutableMapping
 from pathlib import Path
-from urllib.parse import urlparse
 
 import aiohttp
 import requests
+from urllib3.util import parse_url
 
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -37,7 +37,7 @@ class HTTPConnection:
         return self._async_client
 
     def _validate_http_url(self, url: str):
-        parsed_url = urlparse(url)
+        parsed_url = parse_url(url)
 
         if parsed_url.scheme not in ("http", "https"):
             raise ValueError(
```

```diff
@@ -442,9 +442,9 @@ def get_vllm_port() -> int | None:
     try:
        return int(port)
    except ValueError as err:
-        from urllib.parse import urlparse
+        from urllib3.util import parse_url
 
-        parsed = urlparse(port)
+        parsed = parse_url(port)
         if parsed.scheme:
             raise ValueError(
                 f"VLLM_PORT '{port}' appears to be a URI. "
```

```diff
@@ -9,13 +9,13 @@ from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar
-from urllib.parse import ParseResult, urlparse
 from urllib.request import url2pathname
 
 import numpy as np
 import numpy.typing as npt
 import torch
 from PIL import Image, UnidentifiedImageError
+from urllib3.util import Url, parse_url
 
 import vllm.envs as envs
 from vllm.connections import HTTPConnection, global_http_connection
@@ -101,11 +101,14 @@ class MediaConnector:
 
     def _load_data_url(
         self,
-        url_spec: ParseResult,
+        url_spec: Url,
         media_io: MediaIO[_M],
     ) -> _M:  # type: ignore[type-var]
-        data_spec, data = url_spec.path.split(",", 1)
+        url_spec_path = url_spec.path or ""
+        data_spec, data = url_spec_path.split(",", 1)
         media_type, data_type = data_spec.split(";", 1)
+        # media_type starts with a leading "/" (e.g., "/video/jpeg")
+        media_type = media_type.lstrip("/")
 
         if data_type != "base64":
             msg = "Only base64 data URLs are supported for now."
@@ -115,7 +118,7 @@ class MediaConnector:
 
     def _load_file_url(
         self,
-        url_spec: ParseResult,
+        url_spec: Url,
         media_io: MediaIO[_M],
     ) -> _M:  # type: ignore[type-var]
         allowed_local_media_path = self.allowed_local_media_path
@@ -124,7 +127,9 @@ class MediaConnector:
                 "Cannot load local files without `--allowed-local-media-path`."
             )
 
-        filepath = Path(url2pathname(url_spec.netloc + url_spec.path))
+        url_spec_path = url_spec.path or ""
+        url_spec_netloc = url_spec.netloc or ""
+        filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
         if allowed_local_media_path not in filepath.resolve().parents:
             raise ValueError(
                 f"The file path {filepath} must be a subpath "
@@ -133,7 +138,7 @@ class MediaConnector:
 
         return media_io.load_file(filepath)
 
-    def _assert_url_in_allowed_media_domains(self, url_spec: ParseResult) -> None:
+    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
         if (
             self.allowed_media_domains
             and url_spec.hostname not in self.allowed_media_domains
@@ -151,9 +156,9 @@ class MediaConnector:
         *,
         fetch_timeout: int | None = None,
     ) -> _M:  # type: ignore[type-var]
-        url_spec = urlparse(url)
+        url_spec = parse_url(url)
 
-        if url_spec.scheme.startswith("http"):
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)
 
             connection = self.connection
@@ -181,10 +186,10 @@ class MediaConnector:
         *,
         fetch_timeout: int | None = None,
     ) -> _M:
-        url_spec = urlparse(url)
+        url_spec = parse_url(url)
         loop = asyncio.get_running_loop()
 
-        if url_spec.scheme.startswith("http"):
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
             self._assert_url_in_allowed_media_domains(url_spec)
 
             connection = self.connection
```
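The recurring `or ""` fallbacks and the `url_spec.scheme and ...` guards exist because the two parsers disagree on missing components: `urllib.parse.urlparse` returns empty strings, while `urllib3.util.parse_url` returns `None` (and also recognizes a bare host without a `//`). A minimal contrast, assuming stock behavior of both libraries:

```python
from urllib.parse import urlparse
from urllib3.util import parse_url

# Missing pieces: urlparse yields '' while parse_url yields None,
# so truthiness checks like `url_spec.scheme and ...` are required.
assert urlparse("http://example.com").path == ""
assert parse_url("http://example.com").path is None

# A scheme-less string: urlparse treats it all as a path;
# parse_url treats the leading component as the host.
assert urlparse("example.com/img.png").netloc == ""
assert parse_url("example.com/img.png").host == "example.com"
assert parse_url("example.com/img.png").scheme is None
```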
```diff
@@ -11,12 +11,12 @@ from collections.abc import (
     Sequence,
 )
 from typing import Any
-from urllib.parse import urlparse
 from uuid import uuid4
 
 import psutil
 import zmq
 import zmq.asyncio
+from urllib3.util import parse_url
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -217,13 +217,15 @@ def find_process_using_port(port: int) -> psutil.Process | None:
 
 def split_zmq_path(path: str) -> tuple[str, str, str]:
     """Split a zmq path into its parts."""
-    parsed = urlparse(path)
+    parsed = parse_url(path)
     if not parsed.scheme:
         raise ValueError(f"Invalid zmq path: {path}")
 
     scheme = parsed.scheme
     host = parsed.hostname or ""
     port = str(parsed.port or "")
+    if host.startswith("[") and host.endswith("]"):
+        host = host[1:-1]  # Remove brackets for IPv6 address
 
     if scheme == "tcp" and not all((host, port)):
         # The host and port fields are required for tcp
```
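The new bracket-stripping lines compensate for a behavioral difference: unlike `urlparse().hostname`, `parse_url().hostname` can keep the square brackets around IPv6 literals, which is what the added check removes. A quick sketch of the expected results for tcp paths, assuming the `split_zmq_path` shown above:

```python
# Hypothetical invocations of split_zmq_path, illustrating the intended results.
split_zmq_path("tcp://127.0.0.1:5555")  # -> ("tcp", "127.0.0.1", "5555")
split_zmq_path("tcp://[::1]:5555")      # -> ("tcp", "::1", "5555") after bracket stripping
```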
```diff
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
         self.num_free_slots = cache_size
-        self.freed: list[str] = []
+        self.allocated: list[str] = []
+        self.to_free: list[str] = []
 
     def check_and_update_cache(self, request: Request, input_id: int) -> bool:
         return False
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         self.num_free_slots -= num_encoder_embeds
 
         mm_hash = request.mm_features[input_id].identifier
-        self.freed.append(mm_hash)
+        self.allocated.append(mm_hash)
 
     def free(self, request: Request) -> None:
         for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         return set(range(len(request.mm_features)))
 
     def get_freed_mm_hashes(self) -> list[str]:
-        freed = self.freed
-        self.freed = []
-        return freed
+        # As encoder cache is not used for enc-dec models, we can free the entries here
+        # The actual free happens in the runner, *before* the model is executed.
+        # Therefore, `freeable` acts as a buffer to free the entries only after the
+        # model is executed, mimicking the state transition of `EncoderCacheManager`.
+        to_free = self.to_free
+        self.to_free = self.allocated
+        self.allocated = []
+        return to_free
 
     def free_encoder_input(self, request: Request, input_id: int) -> None:
         num_encoder_embeds = request.get_num_encoder_embeds(input_id)
```
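Replacing the single `freed` list with the `allocated`/`to_free` pair turns `get_freed_mm_hashes()` into a one-step delay line: hashes allocated during scheduling step N are only reported as freed at step N+1, after the model has executed. A minimal standalone sketch of that state transition (illustrative only, not the vLLM class):

```python
class DeferredFreeBuffer:
    """Toy model of the allocated -> to_free -> freed handshake above."""

    def __init__(self) -> None:
        self.allocated: list[str] = []  # filled while scheduling step N
        self.to_free: list[str] = []    # drained after the model has run

    def allocate(self, mm_hash: str) -> None:
        self.allocated.append(mm_hash)

    def get_freed_mm_hashes(self) -> list[str]:
        # Report last step's allocations as freed; hold this step's until next call.
        to_free = self.to_free
        self.to_free = self.allocated
        self.allocated = []
        return to_free

buf = DeferredFreeBuffer()
buf.allocate("hash-1")
assert buf.get_freed_mm_hashes() == []          # step N: entry still in use
assert buf.get_freed_mm_hashes() == ["hash-1"]  # step N+1: safe to free
```

This is also what the new `test_encoder_cache_cleanup` regression test above checks end to end: the cache drains to empty only once requests have actually run.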