Compare commits
8 Commits
v0.14.0rc2
...
v0.14.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d7de043d55 | ||
|
|
4dc11b06d3 | ||
|
|
2bd95d803a | ||
|
|
f46d576c54 | ||
|
|
d68209402d | ||
|
|
b17039bccc | ||
|
|
48b67ba75f | ||
|
|
09f4264a55 |
@@ -1,6 +1,6 @@
|
|||||||
steps:
|
steps:
|
||||||
# aarch64 + CUDA builds
|
# aarch64 + CUDA builds
|
||||||
- label: "Build arm64 wheel - CUDA 12.9"
|
- label: "Build wheel - aarch64 - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-arm64-cuda-12-9
|
id: build-wheel-arm64-cuda-12-9
|
||||||
agents:
|
agents:
|
||||||
@@ -11,11 +11,11 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build arm64 wheel - CUDA 13.0"
|
- label: "Build wheel - aarch64 - CUDA 13.0"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-arm64-cuda-13-0
|
id: build-wheel-arm64-cuda-13-0
|
||||||
agents:
|
agents:
|
||||||
@@ -26,12 +26,12 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# aarch64 build
|
# aarch64 build
|
||||||
- label: "Build arm64 CPU wheel"
|
- label: "Build wheel - aarch64 - CPU"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-arm64-cpu
|
id: build-wheel-arm64-cpu
|
||||||
agents:
|
agents:
|
||||||
@@ -40,39 +40,39 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# x86 + CUDA builds
|
# x86 + CUDA builds
|
||||||
- label: "Build wheel - CUDA 12.9"
|
- label: "Build wheel - x86_64 - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-cuda-12-9
|
id: build-wheel-x86-cuda-12-9
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 13.0"
|
- label: "Build wheel - x86_64 - CUDA 13.0"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-cuda-13-0
|
id: build-wheel-x86-cuda-13-0
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# x86 CPU wheel build
|
# x86 CPU wheel build
|
||||||
- label: "Build x86 CPU wheel"
|
- label: "Build wheel - x86_64 - CPU"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-x86-cpu
|
id: build-wheel-x86-cpu
|
||||||
agents:
|
agents:
|
||||||
@@ -81,12 +81,12 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# Build release images (12.9)
|
# Build release images (CUDA 12.9)
|
||||||
- label: "Build release image (x86)"
|
- label: "Build release image - x86_64 - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-release-image-x86
|
id: build-release-image-x86
|
||||||
agents:
|
agents:
|
||||||
@@ -99,7 +99,7 @@ steps:
|
|||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Build release image (arm64)"
|
- label: "Build release image - aarch64 - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-release-image-arm64
|
id: build-release-image-arm64
|
||||||
agents:
|
agents:
|
||||||
@@ -109,34 +109,93 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||||
|
|
||||||
# Add job to create multi-arch manifest
|
- label: "Create multi-arch manifest - CUDA 12.9"
|
||||||
- label: "Create multi-arch manifest"
|
|
||||||
depends_on:
|
depends_on:
|
||||||
- build-release-image-x86
|
- build-release-image-x86
|
||||||
- build-release-image-arm64
|
- build-release-image-arm64
|
||||||
id: create-multi-arch-manifest
|
id: create-multi-arch-manifest
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: small_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Annotate release workflow"
|
- label: "Annotate release workflow - CUDA 12.9"
|
||||||
depends_on:
|
depends_on:
|
||||||
- create-multi-arch-manifest
|
- create-multi-arch-manifest
|
||||||
id: annotate-release-workflow
|
id: annotate-release-workflow
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: small_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "bash .buildkite/scripts/annotate-release.sh"
|
- "bash .buildkite/scripts/annotate-release.sh"
|
||||||
|
|
||||||
|
- block: "Build CUDA 13.0 release images"
|
||||||
|
key: block-release-image-build-cuda-13-0
|
||||||
|
depends_on: ~
|
||||||
|
|
||||||
|
- label: "Build release image - x86_64 - CUDA 13.0"
|
||||||
|
depends_on: block-release-image-build-cuda-13-0
|
||||||
|
id: build-release-image-x86-cuda-13-0
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||||
|
# re-tag to default image tag and push, just in case arm64 build fails
|
||||||
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||||
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||||
|
|
||||||
|
- label: "Build release image - aarch64 - CUDA 13.0"
|
||||||
|
depends_on: block-release-image-build-cuda-13-0
|
||||||
|
id: build-release-image-arm64-cuda-13-0
|
||||||
|
agents:
|
||||||
|
queue: arm64_cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
|
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
||||||
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||||
|
|
||||||
|
- label: "Create multi-arch manifest - CUDA 13.0"
|
||||||
|
depends_on:
|
||||||
|
- build-release-image-x86-cuda-13-0
|
||||||
|
- build-release-image-arm64-cuda-13-0
|
||||||
|
id: create-multi-arch-manifest-cuda-13-0
|
||||||
|
agents:
|
||||||
|
queue: small_cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
|
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
||||||
|
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||||
|
|
||||||
- input: "Provide Release version here"
|
- input: "Provide Release version here"
|
||||||
id: input-release-version
|
id: input-release-version
|
||||||
fields:
|
fields:
|
||||||
- text: "What is the release version?"
|
- text: "What is the release version?"
|
||||||
key: release-version
|
key: release-version
|
||||||
|
|
||||||
|
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||||
|
key: block-upload-release-wheels
|
||||||
|
depends_on:
|
||||||
|
- input-release-version
|
||||||
|
- build-wheel-x86-cuda-12-9
|
||||||
|
- build-wheel-x86-cuda-13-0
|
||||||
|
- build-wheel-x86-cpu
|
||||||
|
- build-wheel-arm64-cuda-12-9
|
||||||
|
- build-wheel-arm64-cuda-13-0
|
||||||
|
- build-wheel-arm64-cpu
|
||||||
|
|
||||||
|
- label: "Upload release wheels to PyPI and GitHub"
|
||||||
|
depends_on:
|
||||||
|
- block-upload-release-wheels
|
||||||
|
id: upload-release-wheels
|
||||||
|
agents:
|
||||||
|
queue: small_cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "bash .buildkite/scripts/upload-release-wheels.sh"
|
||||||
|
|
||||||
- block: "Build CPU release image"
|
- block: "Build CPU release image"
|
||||||
key: block-cpu-release-image-build
|
key: block-cpu-release-image-build
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
@@ -186,25 +245,14 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
||||||
|
|
||||||
|
|
||||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||||
depends_on:
|
depends_on:
|
||||||
- create-multi-arch-manifest
|
- create-multi-arch-manifest
|
||||||
if: build.env("NIGHTLY") == "1"
|
if: build.env("NIGHTLY") == "1"
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: small_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
||||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
|
||||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
|
||||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
|
||||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
|
||||||
# Clean up old nightly builds (keep only last 14)
|
# Clean up old nightly builds (keep only last 14)
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||||
plugins:
|
plugins:
|
||||||
@@ -215,6 +263,25 @@ steps:
|
|||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
DOCKERHUB_USERNAME: "vllmbot"
|
||||||
|
|
||||||
|
- label: "Build and publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
||||||
|
depends_on:
|
||||||
|
- create-multi-arch-manifest-cuda-13-0
|
||||||
|
if: build.env("NIGHTLY") == "1"
|
||||||
|
agents:
|
||||||
|
queue: small_cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
||||||
|
# Clean up old nightly builds (keep only last 14)
|
||||||
|
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
||||||
|
plugins:
|
||||||
|
- docker-login#v3.0.0:
|
||||||
|
username: vllmbot
|
||||||
|
password-env: DOCKERHUB_TOKEN
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
DOCKERHUB_USERNAME: "vllmbot"
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ROCm Release Pipeline (x86_64 only)
|
# ROCm Release Pipeline (x86_64 only)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -3,7 +3,14 @@
|
|||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||||
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
# This script uses DockerHub API to list and delete old tags with specified prefix
|
||||||
|
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
|
||||||
|
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
|
||||||
|
|
||||||
|
# Get tag prefix from argument, default to "nightly-" if not provided
|
||||||
|
TAG_PREFIX="${1:-nightly-}"
|
||||||
|
|
||||||
|
echo "Cleaning up tags with prefix: $TAG_PREFIX"
|
||||||
|
|
||||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||||
@@ -45,7 +52,7 @@ get_all_tags() {
|
|||||||
set -x
|
set -x
|
||||||
|
|
||||||
# Get both last_updated timestamp and tag name, separated by |
|
# Get both last_updated timestamp and tag name, separated by |
|
||||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
|
||||||
|
|
||||||
if [ -z "$tags" ]; then
|
if [ -z "$tags" ]; then
|
||||||
break
|
break
|
||||||
|
|||||||
36
.buildkite/scripts/push-nightly-builds.sh
Executable file
36
.buildkite/scripts/push-nightly-builds.sh
Executable file
@@ -0,0 +1,36 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
|
||||||
|
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
|
||||||
|
# otherwise they will be cleaned up together with the main "nightly" tags.
|
||||||
|
|
||||||
|
TAG_VARIANT="$1"
|
||||||
|
if [ -n "$TAG_VARIANT" ]; then
|
||||||
|
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
|
||||||
|
TAG_NAME="$TAG_VARIANT-nightly"
|
||||||
|
else
|
||||||
|
ORIG_TAG_SUFFIX=""
|
||||||
|
TAG_NAME="nightly"
|
||||||
|
fi
|
||||||
|
|
||||||
|
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
|
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
|
||||||
|
|
||||||
|
# pull original arch-dependent images from AWS ECR Public
|
||||||
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
|
||||||
|
# tag arch-dependent images
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
|
||||||
|
# push arch-dependent images to DockerHub
|
||||||
|
docker push vllm/vllm-openai:$TAG_NAME-x86_64
|
||||||
|
docker push vllm/vllm-openai:$TAG_NAME-aarch64
|
||||||
|
# push arch-independent manifest to DockerHub
|
||||||
|
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||||
|
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||||
|
docker manifest push vllm/vllm-openai:$TAG_NAME
|
||||||
|
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
|
||||||
104
.buildkite/scripts/upload-release-wheels.sh
Normal file
104
.buildkite/scripts/upload-release-wheels.sh
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
BUCKET="vllm-wheels"
|
||||||
|
SUBPATH=$BUILDKITE_COMMIT
|
||||||
|
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||||
|
|
||||||
|
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||||
|
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||||
|
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
||||||
|
if [ -z "$GIT_VERSION" ]; then
|
||||||
|
echo "[FATAL] Not on a git tag, cannot create release."
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||||
|
fi
|
||||||
|
# sanity check for version mismatch
|
||||||
|
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
|
||||||
|
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
|
||||||
|
echo "[WARNING] Force release and ignore version mismatch"
|
||||||
|
else
|
||||||
|
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||||
|
|
||||||
|
# check pypi token
|
||||||
|
if [ -z "$PYPI_TOKEN" ]; then
|
||||||
|
echo "[FATAL] PYPI_TOKEN is not set."
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
export TWINE_USERNAME="__token__"
|
||||||
|
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# check github token
|
||||||
|
if [ -z "$GITHUB_TOKEN" ]; then
|
||||||
|
echo "[FATAL] GITHUB_TOKEN is not set."
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
export GH_TOKEN="$GITHUB_TOKEN"
|
||||||
|
fi
|
||||||
|
|
||||||
|
set -x # avoid printing secrets above
|
||||||
|
|
||||||
|
# download gh CLI from github
|
||||||
|
# Get latest gh CLI version from GitHub API
|
||||||
|
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
|
||||||
|
if [ -z "$GH_VERSION" ]; then
|
||||||
|
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Downloading gh CLI version: $GH_VERSION"
|
||||||
|
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
|
||||||
|
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
|
||||||
|
GH_INSTALL_DIR="/tmp/gh-install"
|
||||||
|
mkdir -p "$GH_INSTALL_DIR"
|
||||||
|
pushd "$GH_INSTALL_DIR"
|
||||||
|
curl -L -o "$GH_TARBALL" "$GH_URL"
|
||||||
|
tar -xzf "$GH_TARBALL"
|
||||||
|
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
|
||||||
|
if [ -z "$GH_BIN" ]; then
|
||||||
|
echo "[FATAL] Failed to find gh CLI executable"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
|
||||||
|
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
|
||||||
|
command "$GH_BIN" release list --limit 5
|
||||||
|
popd
|
||||||
|
|
||||||
|
# install twine from pypi
|
||||||
|
python3 -m venv /tmp/vllm-release-env
|
||||||
|
source /tmp/vllm-release-env/bin/activate
|
||||||
|
pip install twine
|
||||||
|
python3 -m twine --version
|
||||||
|
|
||||||
|
# copy release wheels to local directory
|
||||||
|
DIST_DIR=/tmp/vllm-release-dist
|
||||||
|
echo "Existing wheels on S3:"
|
||||||
|
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||||
|
echo "Copying wheels to local directory"
|
||||||
|
mkdir -p $DIST_DIR
|
||||||
|
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||||
|
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||||
|
echo "Wheels copied to local directory"
|
||||||
|
# generate source tarball
|
||||||
|
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
||||||
|
ls -la $DIST_DIR
|
||||||
|
|
||||||
|
|
||||||
|
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||||
|
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||||
|
if [ -z "$PYPI_WHEEL_FILES" ]; then
|
||||||
|
echo "No default variant wheels found, quitting..."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
python3 -m twine check $PYPI_WHEEL_FILES
|
||||||
|
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
|
||||||
|
echo "Wheels uploaded to PyPI"
|
||||||
|
|
||||||
|
# create release on GitHub with the release version and all wheels
|
||||||
|
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
|
||||||
@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
|
|||||||
FROM base AS fetch_vllm_1
|
FROM base AS fetch_vllm_1
|
||||||
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
|
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
|
||||||
ARG VLLM_BRANCH="main"
|
ARG VLLM_BRANCH="main"
|
||||||
|
ENV VLLM_REPO=${VLLM_REPO}
|
||||||
|
ENV VLLM_BRANCH=${VLLM_BRANCH}
|
||||||
ONBUILD RUN git clone ${VLLM_REPO} \
|
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||||
&& cd vllm \
|
&& cd vllm \
|
||||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||||
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
|||||||
&& pip uninstall -y vllm \
|
&& pip uninstall -y vllm \
|
||||||
&& uv pip install --system *.whl
|
&& uv pip install --system *.whl
|
||||||
|
|
||||||
|
# Install RIXL wheel
|
||||||
|
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
|
||||||
|
uv pip install --system /rixl_install/*.whl
|
||||||
|
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
ARG COMMON_WORKDIR
|
ARG COMMON_WORKDIR
|
||||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
|
||||||
|
|||||||
@@ -198,92 +198,6 @@ RUN cd mori \
|
|||||||
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
|
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
|
||||||
|
|
||||||
|
|
||||||
###
|
|
||||||
### RIXL Build
|
|
||||||
###
|
|
||||||
FROM build_pytorch AS build_rixl
|
|
||||||
ARG RIXL_BRANCH
|
|
||||||
ARG RIXL_REPO
|
|
||||||
ARG ETCD_BRANCH
|
|
||||||
ARG ETCD_REPO
|
|
||||||
ARG UCX_BRANCH
|
|
||||||
ARG UCX_REPO
|
|
||||||
|
|
||||||
ENV ROCM_PATH=/opt/rocm
|
|
||||||
ENV UCX_HOME=/usr/local/ucx
|
|
||||||
ENV RIXL_HOME=/usr/local/rixl
|
|
||||||
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
|
|
||||||
|
|
||||||
# RIXL build system dependences and RDMA support
|
|
||||||
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
|
|
||||||
libgrpc-dev \
|
|
||||||
libgrpc++-dev \
|
|
||||||
libprotobuf-dev \
|
|
||||||
protobuf-compiler-grpc \
|
|
||||||
libcpprest-dev \
|
|
||||||
libaio-dev \
|
|
||||||
librdmacm1 \
|
|
||||||
librdmacm-dev \
|
|
||||||
libibverbs1 \
|
|
||||||
libibverbs-dev \
|
|
||||||
ibverbs-utils \
|
|
||||||
rdmacm-utils \
|
|
||||||
ibverbs-providers
|
|
||||||
|
|
||||||
RUN pip install meson auditwheel patchelf tomlkit
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
RUN git clone ${ETCD_REPO} && \
|
|
||||||
cd etcd-cpp-apiv3 && \
|
|
||||||
git checkout ${ETCD_BRANCH} && \
|
|
||||||
mkdir build && cd build && \
|
|
||||||
cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
|
|
||||||
make -j$(nproc) && \
|
|
||||||
make install
|
|
||||||
|
|
||||||
RUN cd /usr/local/src && \
|
|
||||||
git clone ${UCX_REPO} && \
|
|
||||||
cd ucx && \
|
|
||||||
git checkout ${UCX_BRANCH} && \
|
|
||||||
./autogen.sh && \
|
|
||||||
mkdir build && cd build && \
|
|
||||||
../configure \
|
|
||||||
--prefix=/usr/local/ucx \
|
|
||||||
--enable-shared \
|
|
||||||
--disable-static \
|
|
||||||
--disable-doxygen-doc \
|
|
||||||
--enable-optimizations \
|
|
||||||
--enable-devel-headers \
|
|
||||||
--with-rocm=/opt/rocm \
|
|
||||||
--with-verbs \
|
|
||||||
--with-dm \
|
|
||||||
--enable-mt && \
|
|
||||||
make -j && \
|
|
||||||
make -j install
|
|
||||||
|
|
||||||
ENV PATH=/usr/local/ucx/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
|
|
||||||
|
|
||||||
RUN git clone ${RIXL_REPO} /opt/rixl && \
|
|
||||||
cd /opt/rixl && \
|
|
||||||
git checkout ${RIXL_BRANCH} && \
|
|
||||||
meson setup build --prefix=${RIXL_HOME} \
|
|
||||||
-Ducx_path=${UCX_HOME} \
|
|
||||||
-Drocm_path=${ROCM_PATH} && \
|
|
||||||
cd build && \
|
|
||||||
ninja && \
|
|
||||||
ninja install
|
|
||||||
|
|
||||||
# Generate RIXL wheel
|
|
||||||
RUN cd /opt/rixl && mkdir -p /app/install && \
|
|
||||||
./contrib/build-wheel.sh \
|
|
||||||
--output-dir /app/install \
|
|
||||||
--rocm-dir ${ROCM_PATH} \
|
|
||||||
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
|
||||||
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
|
||||||
|
|
||||||
|
|
||||||
###
|
###
|
||||||
### FlashAttention Build
|
### FlashAttention Build
|
||||||
###
|
###
|
||||||
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
|||||||
cp /install/*.whl /app/debs
|
cp /install/*.whl /app/debs
|
||||||
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
|
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
|
||||||
cp /install/*.whl /app/debs
|
cp /install/*.whl /app/debs
|
||||||
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
|
|
||||||
cp /install/*.whl /app/debs
|
|
||||||
|
|
||||||
FROM base AS final
|
FROM base AS final
|
||||||
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
|
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
|
||||||
@@ -385,12 +297,6 @@ ARG FA_BRANCH
|
|||||||
ARG FA_REPO
|
ARG FA_REPO
|
||||||
ARG AITER_BRANCH
|
ARG AITER_BRANCH
|
||||||
ARG AITER_REPO
|
ARG AITER_REPO
|
||||||
ARG RIXL_BRANCH
|
|
||||||
ARG RIXL_REPO
|
|
||||||
ARG ETCD_BRANCH
|
|
||||||
ARG ETCD_REPO
|
|
||||||
ARG UCX_BRANCH
|
|
||||||
ARG UCX_REPO
|
|
||||||
ARG MORI_BRANCH
|
ARG MORI_BRANCH
|
||||||
ARG MORI_REPO
|
ARG MORI_REPO
|
||||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||||
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
|||||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
|
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
|
||||||
&& echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
|
|
||||||
&& echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
|
|
||||||
&& echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
|
|
||||||
&& echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
|
|
||||||
&& echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
|
|
||||||
&& echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
|
|
||||||
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
|
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
|
||||||
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
|
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ pyzmq >= 25.0.0
|
|||||||
msgspec
|
msgspec
|
||||||
gguf >= 0.17.0
|
gguf >= 0.17.0
|
||||||
mistral_common[image] >= 1.8.8
|
mistral_common[image] >= 1.8.8
|
||||||
opencv-python-headless >= 4.11.0 # required for video IO
|
opencv-python-headless >= 4.13.0 # required for video IO
|
||||||
pyyaml
|
pyyaml
|
||||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||||
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ transformers_stream_generator # required for qwen-vl test
|
|||||||
matplotlib # required for qwen-vl test
|
matplotlib # required for qwen-vl test
|
||||||
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
|
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
|
||||||
num2words # required for smolvlm test
|
num2words # required for smolvlm test
|
||||||
opencv-python-headless >= 4.11.0 # required for video test
|
opencv-python-headless >= 4.13.0 # required for video test
|
||||||
datamodel_code_generator # required for minicpm3 test
|
datamodel_code_generator # required for minicpm3 test
|
||||||
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
||||||
mteb>=1.38.11, <2 # required for mteb test
|
mteb>=1.38.11, <2 # required for mteb test
|
||||||
@@ -37,8 +37,8 @@ bitsandbytes>=0.46.1
|
|||||||
buildkite-test-collector==0.1.9
|
buildkite-test-collector==0.1.9
|
||||||
|
|
||||||
|
|
||||||
genai_perf==0.0.8
|
genai_perf>=0.0.8
|
||||||
tritonclient==2.51.0
|
tritonclient>=2.51.0
|
||||||
|
|
||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
numpy
|
numpy
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ matplotlib # required for qwen-vl test
|
|||||||
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
|
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
|
||||||
num2words # required for smolvlm test
|
num2words # required for smolvlm test
|
||||||
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
|
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
|
||||||
opencv-python-headless >= 4.11.0 # required for video test
|
opencv-python-headless >= 4.13.0 # required for video test
|
||||||
datamodel_code_generator # required for minicpm3 test
|
datamodel_code_generator # required for minicpm3 test
|
||||||
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
lm-eval[api]>=0.4.9.2 # required for model evaluation test
|
||||||
mteb[bm25s]>=2, <3 # required for mteb test
|
mteb[bm25s]>=2, <3 # required for mteb test
|
||||||
@@ -45,8 +45,8 @@ bitsandbytes==0.46.1
|
|||||||
buildkite-test-collector==0.1.9
|
buildkite-test-collector==0.1.9
|
||||||
|
|
||||||
|
|
||||||
genai_perf==0.0.8
|
genai_perf>=0.0.8
|
||||||
tritonclient==2.51.0
|
tritonclient>=2.51.0
|
||||||
|
|
||||||
arctic-inference == 0.1.1 # Required for suffix decoding test
|
arctic-inference == 0.1.1 # Required for suffix decoding test
|
||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
|
|||||||
@@ -31,7 +31,11 @@ albumentations==1.4.6
|
|||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# terratorch
|
# terratorch
|
||||||
alembic==1.16.4
|
alembic==1.16.4
|
||||||
# via mlflow
|
# via
|
||||||
|
# mlflow
|
||||||
|
# optuna
|
||||||
|
annotated-doc==0.0.4
|
||||||
|
# via fastapi
|
||||||
annotated-types==0.7.0
|
annotated-types==0.7.0
|
||||||
# via pydantic
|
# via pydantic
|
||||||
antlr4-python3-runtime==4.9.3
|
antlr4-python3-runtime==4.9.3
|
||||||
@@ -143,6 +147,8 @@ colorama==0.4.6
|
|||||||
# tqdm-multiprocess
|
# tqdm-multiprocess
|
||||||
colorful==0.5.6
|
colorful==0.5.6
|
||||||
# via ray
|
# via ray
|
||||||
|
colorlog==6.10.1
|
||||||
|
# via optuna
|
||||||
contourpy==1.3.0
|
contourpy==1.3.0
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
coverage==7.10.6
|
coverage==7.10.6
|
||||||
@@ -250,7 +256,7 @@ fsspec==2024.9.0
|
|||||||
# torch
|
# torch
|
||||||
ftfy==6.3.1
|
ftfy==6.3.1
|
||||||
# via open-clip-torch
|
# via open-clip-torch
|
||||||
genai-perf==0.0.8
|
genai-perf==0.0.16
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
genson==1.3.0
|
genson==1.3.0
|
||||||
# via datamodel-code-generator
|
# via datamodel-code-generator
|
||||||
@@ -387,6 +393,7 @@ jinja2==3.1.6
|
|||||||
# via
|
# via
|
||||||
# datamodel-code-generator
|
# datamodel-code-generator
|
||||||
# flask
|
# flask
|
||||||
|
# genai-perf
|
||||||
# mlflow
|
# mlflow
|
||||||
# torch
|
# torch
|
||||||
jiwer==3.0.5
|
jiwer==3.0.5
|
||||||
@@ -526,7 +533,7 @@ numba==0.61.2
|
|||||||
# librosa
|
# librosa
|
||||||
numexpr==2.10.1
|
numexpr==2.10.1
|
||||||
# via lm-eval
|
# via lm-eval
|
||||||
numpy==1.26.4
|
numpy==2.2.6
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# accelerate
|
# accelerate
|
||||||
@@ -556,6 +563,7 @@ numpy==1.26.4
|
|||||||
# numba
|
# numba
|
||||||
# numexpr
|
# numexpr
|
||||||
# opencv-python-headless
|
# opencv-python-headless
|
||||||
|
# optuna
|
||||||
# pandas
|
# pandas
|
||||||
# patsy
|
# patsy
|
||||||
# peft
|
# peft
|
||||||
@@ -635,7 +643,7 @@ opencensus==0.11.4
|
|||||||
# via ray
|
# via ray
|
||||||
opencensus-context==0.1.3
|
opencensus-context==0.1.3
|
||||||
# via opencensus
|
# via opencensus
|
||||||
opencv-python-headless==4.11.0.86
|
opencv-python-headless==4.13.0.90
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# albucore
|
# albucore
|
||||||
@@ -658,6 +666,10 @@ opentelemetry-sdk==1.35.0
|
|||||||
# ray
|
# ray
|
||||||
opentelemetry-semantic-conventions==0.56b0
|
opentelemetry-semantic-conventions==0.56b0
|
||||||
# via opentelemetry-sdk
|
# via opentelemetry-sdk
|
||||||
|
optuna==3.6.1
|
||||||
|
# via genai-perf
|
||||||
|
orjson==3.11.5
|
||||||
|
# via genai-perf
|
||||||
packaging==24.2
|
packaging==24.2
|
||||||
# via
|
# via
|
||||||
# accelerate
|
# accelerate
|
||||||
@@ -676,6 +688,7 @@ packaging==24.2
|
|||||||
# lightning-utilities
|
# lightning-utilities
|
||||||
# matplotlib
|
# matplotlib
|
||||||
# mlflow-skinny
|
# mlflow-skinny
|
||||||
|
# optuna
|
||||||
# peft
|
# peft
|
||||||
# plotly
|
# plotly
|
||||||
# pooch
|
# pooch
|
||||||
@@ -715,6 +728,8 @@ peft==0.16.0
|
|||||||
# lm-eval
|
# lm-eval
|
||||||
perceptron==0.1.4
|
perceptron==0.1.4
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
|
perf-analyzer==0.1.0
|
||||||
|
# via genai-perf
|
||||||
pillow==10.4.0
|
pillow==10.4.0
|
||||||
# via
|
# via
|
||||||
# genai-perf
|
# genai-perf
|
||||||
@@ -901,6 +916,7 @@ pyyaml==6.0.2
|
|||||||
# lightning
|
# lightning
|
||||||
# mlflow-skinny
|
# mlflow-skinny
|
||||||
# omegaconf
|
# omegaconf
|
||||||
|
# optuna
|
||||||
# peft
|
# peft
|
||||||
# pytorch-lightning
|
# pytorch-lightning
|
||||||
# ray
|
# ray
|
||||||
@@ -1063,6 +1079,7 @@ sortedcontainers==2.4.0
|
|||||||
soundfile==0.12.1
|
soundfile==0.12.1
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
|
# genai-perf
|
||||||
# librosa
|
# librosa
|
||||||
# mistral-common
|
# mistral-common
|
||||||
soxr==0.5.0.post1
|
soxr==0.5.0.post1
|
||||||
@@ -1073,6 +1090,7 @@ sqlalchemy==2.0.41
|
|||||||
# via
|
# via
|
||||||
# alembic
|
# alembic
|
||||||
# mlflow
|
# mlflow
|
||||||
|
# optuna
|
||||||
sqlitedict==2.1.0
|
sqlitedict==2.1.0
|
||||||
# via lm-eval
|
# via lm-eval
|
||||||
sqlparse==0.5.3
|
sqlparse==0.5.3
|
||||||
@@ -1202,6 +1220,7 @@ tqdm==4.66.6
|
|||||||
# mteb
|
# mteb
|
||||||
# nltk
|
# nltk
|
||||||
# open-clip-torch
|
# open-clip-torch
|
||||||
|
# optuna
|
||||||
# peft
|
# peft
|
||||||
# pqdm
|
# pqdm
|
||||||
# pretrainedmodels
|
# pretrainedmodels
|
||||||
@@ -1224,10 +1243,8 @@ transformers-stream-generator==0.0.5
|
|||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
triton==3.5.1
|
triton==3.5.1
|
||||||
# via torch
|
# via torch
|
||||||
tritonclient==2.51.0
|
tritonclient==2.64.0
|
||||||
# via
|
# via -r requirements/test.in
|
||||||
# -r requirements/test.in
|
|
||||||
# genai-perf
|
|
||||||
typepy==1.3.2
|
typepy==1.3.2
|
||||||
# via
|
# via
|
||||||
# dataproperty
|
# dataproperty
|
||||||
|
|||||||
@@ -267,12 +267,16 @@ async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
|
|||||||
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
|
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
|
||||||
assert len(out_tokens) == 1
|
assert len(out_tokens) == 1
|
||||||
# max_completion_tokens > max_model_len
|
# max_completion_tokens > max_model_len
|
||||||
|
# max_model_len=32768 for Gemma-3n-E2B-it
|
||||||
transcription = await client.audio.transcriptions.create(
|
transcription = await client.audio.transcriptions.create(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
file=mary_had_lamb,
|
file=mary_had_lamb,
|
||||||
response_format="text",
|
response_format="text",
|
||||||
temperature=0.0,
|
temperature=0.0,
|
||||||
extra_body={"max_completion_tokens": int(1e6)},
|
extra_body={
|
||||||
|
"max_completion_tokens": int(1e6),
|
||||||
|
"repetition_penalty": 1.3,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
out = json.loads(transcription)
|
out = json.loads(transcription)
|
||||||
out_text = out["text"]
|
out_text = out["text"]
|
||||||
|
|||||||
@@ -176,3 +176,46 @@ def test_models_distributed(
|
|||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enforce_eager=False,
|
enforce_eager=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.core_model
|
||||||
|
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||||
|
def test_encoder_cache_cleanup(
|
||||||
|
vllm_runner,
|
||||||
|
model: str,
|
||||||
|
input_audios,
|
||||||
|
monkeypatch,
|
||||||
|
) -> None:
|
||||||
|
"""Test that encoder cache is properly cleaned up after requests complete.
|
||||||
|
|
||||||
|
This is a regression test for a bug where encoder cache entries were freed
|
||||||
|
in the same scheduling step they were allocated, before the model could use
|
||||||
|
them.
|
||||||
|
"""
|
||||||
|
# Set single-process mode to access the model runner's encoder cache directly
|
||||||
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
check_model_available(model)
|
||||||
|
|
||||||
|
with vllm_runner(
|
||||||
|
model,
|
||||||
|
dtype="half",
|
||||||
|
max_model_len=448,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
limit_mm_per_prompt={"audio": 2},
|
||||||
|
enforce_eager=True,
|
||||||
|
) as vllm_model:
|
||||||
|
engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
|
||||||
|
model_runner = engine_core.model_executor.driver_worker.worker.model_runner
|
||||||
|
encoder_cache = model_runner.encoder_cache
|
||||||
|
|
||||||
|
# Run multiple sequential requests to ensure cache is properly managed
|
||||||
|
for vllm_prompts, _, audios in input_audios:
|
||||||
|
vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
|
||||||
|
|
||||||
|
# After all requests complete, encoder cache should be empty
|
||||||
|
cache_size = len(encoder_cache)
|
||||||
|
assert cache_size == 0, (
|
||||||
|
f"Encoder cache should be empty after all requests complete, "
|
||||||
|
f"but has {cache_size} entries. This indicates encoder cache "
|
||||||
|
f"entries are not being properly freed."
|
||||||
|
)
|
||||||
|
|||||||
@@ -3,10 +3,10 @@
|
|||||||
|
|
||||||
from collections.abc import Mapping, MutableMapping
|
from collections.abc import Mapping, MutableMapping
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import requests
|
import requests
|
||||||
|
from urllib3.util import parse_url
|
||||||
|
|
||||||
from vllm.version import __version__ as VLLM_VERSION
|
from vllm.version import __version__ as VLLM_VERSION
|
||||||
|
|
||||||
@@ -37,7 +37,7 @@ class HTTPConnection:
|
|||||||
return self._async_client
|
return self._async_client
|
||||||
|
|
||||||
def _validate_http_url(self, url: str):
|
def _validate_http_url(self, url: str):
|
||||||
parsed_url = urlparse(url)
|
parsed_url = parse_url(url)
|
||||||
|
|
||||||
if parsed_url.scheme not in ("http", "https"):
|
if parsed_url.scheme not in ("http", "https"):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
@@ -540,14 +540,8 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
generator = await handler.create_completion(request, raw_request)
|
generator = await handler.create_completion(request, raw_request)
|
||||||
except OverflowError as e:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ from vllm.entrypoints.responses_utils import (
|
|||||||
construct_input_messages,
|
construct_input_messages,
|
||||||
)
|
)
|
||||||
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
|
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
|
||||||
from vllm.entrypoints.utils import _validate_truncation_size
|
from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
|
||||||
from vllm.inputs.data import PromptType, TokensPrompt
|
from vllm.inputs.data import PromptType, TokensPrompt
|
||||||
from vllm.inputs.parse import (
|
from vllm.inputs.parse import (
|
||||||
PromptComponents,
|
PromptComponents,
|
||||||
@@ -760,11 +760,15 @@ class OpenAIServing:
|
|||||||
err_type = "BadRequestError"
|
err_type = "BadRequestError"
|
||||||
status_code = HTTPStatus.BAD_REQUEST
|
status_code = HTTPStatus.BAD_REQUEST
|
||||||
param = exc.parameter
|
param = exc.parameter
|
||||||
elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
|
elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
|
||||||
# Common validation errors from user input
|
# Common validation errors from user input
|
||||||
err_type = "BadRequestError"
|
err_type = "BadRequestError"
|
||||||
status_code = HTTPStatus.BAD_REQUEST
|
status_code = HTTPStatus.BAD_REQUEST
|
||||||
param = None
|
param = None
|
||||||
|
elif isinstance(exc, NotImplementedError):
|
||||||
|
err_type = "NotImplementedError"
|
||||||
|
status_code = HTTPStatus.NOT_IMPLEMENTED
|
||||||
|
param = None
|
||||||
elif exc.__class__.__name__ == "TemplateError":
|
elif exc.__class__.__name__ == "TemplateError":
|
||||||
# jinja2.TemplateError (avoid importing jinja2)
|
# jinja2.TemplateError (avoid importing jinja2)
|
||||||
err_type = "BadRequestError"
|
err_type = "BadRequestError"
|
||||||
@@ -783,9 +787,10 @@ class OpenAIServing:
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
else:
|
else:
|
||||||
traceback.print_stack()
|
traceback.print_stack()
|
||||||
|
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(
|
error=ErrorInfo(
|
||||||
message=message,
|
message=sanitize_message(message),
|
||||||
type=err_type,
|
type=err_type,
|
||||||
code=status_code.value,
|
code=status_code.value,
|
||||||
param=param,
|
param=param,
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
ModelPermission,
|
ModelPermission,
|
||||||
UnloadLoRAAdapterRequest,
|
UnloadLoRAAdapterRequest,
|
||||||
)
|
)
|
||||||
|
from vllm.entrypoints.utils import sanitize_message
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||||
@@ -300,5 +301,9 @@ def create_error_response(
|
|||||||
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
|
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
|
||||||
) -> ErrorResponse:
|
) -> ErrorResponse:
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=message, type=err_type, code=status_code.value)
|
error=ErrorInfo(
|
||||||
|
message=sanitize_message(message),
|
||||||
|
type=err_type,
|
||||||
|
code=status_code.value,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from http import HTTPStatus
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
from fastapi import APIRouter, Depends, Request
|
||||||
from starlette.responses import JSONResponse
|
from starlette.responses import JSONResponse
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
@@ -36,9 +35,8 @@ async def create_classify(request: ClassificationRequest, raw_request: Request):
|
|||||||
try:
|
try:
|
||||||
generator = await handler.create_classify(request, raw_request)
|
generator = await handler.create_classify(request, raw_request)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
content=generator.model_dump(), status_code=generator.error.code
|
content=generator.model_dump(), status_code=generator.error.code
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
from fastapi import APIRouter, Depends, Request
|
||||||
from fastapi.responses import JSONResponse, StreamingResponse
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
@@ -47,9 +47,7 @@ async def create_embedding(
|
|||||||
try:
|
try:
|
||||||
generator = await handler.create_embedding(request, raw_request)
|
generator = await handler.create_embedding(request, raw_request)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
from fastapi import APIRouter, Depends, Request
|
||||||
from fastapi.responses import JSONResponse, StreamingResponse
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
@@ -44,9 +44,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
|
|||||||
try:
|
try:
|
||||||
generator = await handler.create_pooling(request, raw_request)
|
generator = await handler.create_pooling(request, raw_request)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
content=generator.model_dump(), status_code=generator.error.code
|
content=generator.model_dump(), status_code=generator.error.code
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
from fastapi import APIRouter, Depends, Request
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
@@ -52,9 +52,8 @@ async def create_score(request: ScoreRequest, raw_request: Request):
|
|||||||
try:
|
try:
|
||||||
generator = await handler.create_score(request, raw_request)
|
generator = await handler.create_score(request, raw_request)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
content=generator.model_dump(), status_code=generator.error.code
|
content=generator.model_dump(), status_code=generator.error.code
|
||||||
@@ -104,9 +103,8 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
|
|||||||
try:
|
try:
|
||||||
generator = await handler.do_rerank(request, raw_request)
|
generator = await handler.do_rerank(request, raw_request)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
content=generator.model_dump(), status_code=generator.error.code
|
content=generator.model_dump(), status_code=generator.error.code
|
||||||
|
|||||||
@@ -67,9 +67,8 @@ async def generate(request: GenerateRequest, raw_request: Request):
|
|||||||
try:
|
try:
|
||||||
generator = await handler.serve_tokens(request, raw_request)
|
generator = await handler.serve_tokens(request, raw_request)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
content=generator.model_dump(), status_code=generator.error.code
|
content=generator.model_dump(), status_code=generator.error.code
|
||||||
|
|||||||
@@ -49,14 +49,8 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
generator = await handler.create_tokenize(request, raw_request)
|
generator = await handler.create_tokenize(request, raw_request)
|
||||||
except NotImplementedError as e:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
return handler.create_error_response(e)
|
||||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
|
|
||||||
) from e
|
|
||||||
|
|
||||||
if isinstance(generator, ErrorResponse):
|
if isinstance(generator, ErrorResponse):
|
||||||
return JSONResponse(
|
return JSONResponse(
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import functools
|
|||||||
import os
|
import os
|
||||||
from argparse import Namespace
|
from argparse import Namespace
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
from fastapi import Request
|
from fastapi import Request
|
||||||
@@ -22,18 +22,25 @@ from vllm.entrypoints.chat_utils import (
|
|||||||
resolve_hf_chat_template,
|
resolve_hf_chat_template,
|
||||||
resolve_mistral_chat_template,
|
resolve_mistral_chat_template,
|
||||||
)
|
)
|
||||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
|
||||||
from vllm.entrypoints.openai.protocol import (
|
|
||||||
ChatCompletionRequest,
|
|
||||||
CompletionRequest,
|
|
||||||
StreamOptions,
|
|
||||||
)
|
|
||||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.tokenizers.mistral import MistralTokenizer
|
from vllm.tokenizers.mistral import MistralTokenizer
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from vllm.entrypoints.openai.protocol import (
|
||||||
|
ChatCompletionRequest,
|
||||||
|
CompletionRequest,
|
||||||
|
StreamOptions,
|
||||||
|
)
|
||||||
|
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||||
|
else:
|
||||||
|
ChatCompletionRequest = object
|
||||||
|
CompletionRequest = object
|
||||||
|
StreamOptions = object
|
||||||
|
LoRAModulePath = object
|
||||||
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
VLLM_SUBCMD_PARSER_EPILOG = (
|
VLLM_SUBCMD_PARSER_EPILOG = (
|
||||||
@@ -206,7 +213,7 @@ def _validate_truncation_size(
|
|||||||
|
|
||||||
def get_max_tokens(
|
def get_max_tokens(
|
||||||
max_model_len: int,
|
max_model_len: int,
|
||||||
request: ChatCompletionRequest | CompletionRequest,
|
request: "ChatCompletionRequest | CompletionRequest",
|
||||||
input_length: int,
|
input_length: int,
|
||||||
default_sampling_params: dict,
|
default_sampling_params: dict,
|
||||||
) -> int:
|
) -> int:
|
||||||
@@ -227,6 +234,8 @@ def get_max_tokens(
|
|||||||
|
|
||||||
|
|
||||||
def log_non_default_args(args: Namespace | EngineArgs):
|
def log_non_default_args(args: Namespace | EngineArgs):
|
||||||
|
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||||
|
|
||||||
non_default_args = {}
|
non_default_args = {}
|
||||||
|
|
||||||
# Handle Namespace
|
# Handle Namespace
|
||||||
@@ -255,7 +264,7 @@ def log_non_default_args(args: Namespace | EngineArgs):
|
|||||||
|
|
||||||
|
|
||||||
def should_include_usage(
|
def should_include_usage(
|
||||||
stream_options: StreamOptions | None, enable_force_include_usage: bool
|
stream_options: "StreamOptions | None", enable_force_include_usage: bool
|
||||||
) -> tuple[bool, bool]:
|
) -> tuple[bool, bool]:
|
||||||
if stream_options:
|
if stream_options:
|
||||||
include_usage = stream_options.include_usage or enable_force_include_usage
|
include_usage = stream_options.include_usage or enable_force_include_usage
|
||||||
@@ -270,6 +279,8 @@ def should_include_usage(
|
|||||||
def process_lora_modules(
|
def process_lora_modules(
|
||||||
args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
|
args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
|
||||||
) -> list[LoRAModulePath]:
|
) -> list[LoRAModulePath]:
|
||||||
|
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||||
|
|
||||||
lora_modules = args_lora_modules
|
lora_modules = args_lora_modules
|
||||||
if default_mm_loras:
|
if default_mm_loras:
|
||||||
default_mm_lora_paths = [
|
default_mm_lora_paths = [
|
||||||
|
|||||||
@@ -442,9 +442,9 @@ def get_vllm_port() -> int | None:
|
|||||||
try:
|
try:
|
||||||
return int(port)
|
return int(port)
|
||||||
except ValueError as err:
|
except ValueError as err:
|
||||||
from urllib.parse import urlparse
|
from urllib3.util import parse_url
|
||||||
|
|
||||||
parsed = urlparse(port)
|
parsed = parse_url(port)
|
||||||
if parsed.scheme:
|
if parsed.scheme:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"VLLM_PORT '{port}' appears to be a URI. "
|
f"VLLM_PORT '{port}' appears to be a URI. "
|
||||||
|
|||||||
@@ -9,13 +9,13 @@ from concurrent.futures import ThreadPoolExecutor
|
|||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, TypeVar
|
from typing import TYPE_CHECKING, Any, TypeVar
|
||||||
from urllib.parse import ParseResult, urlparse
|
|
||||||
from urllib.request import url2pathname
|
from urllib.request import url2pathname
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
import torch
|
import torch
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
|
from urllib3.util import Url, parse_url
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.connections import HTTPConnection, global_http_connection
|
from vllm.connections import HTTPConnection, global_http_connection
|
||||||
@@ -101,11 +101,14 @@ class MediaConnector:
|
|||||||
|
|
||||||
def _load_data_url(
|
def _load_data_url(
|
||||||
self,
|
self,
|
||||||
url_spec: ParseResult,
|
url_spec: Url,
|
||||||
media_io: MediaIO[_M],
|
media_io: MediaIO[_M],
|
||||||
) -> _M: # type: ignore[type-var]
|
) -> _M: # type: ignore[type-var]
|
||||||
data_spec, data = url_spec.path.split(",", 1)
|
url_spec_path = url_spec.path or ""
|
||||||
|
data_spec, data = url_spec_path.split(",", 1)
|
||||||
media_type, data_type = data_spec.split(";", 1)
|
media_type, data_type = data_spec.split(";", 1)
|
||||||
|
# media_type starts with a leading "/" (e.g., "/video/jpeg")
|
||||||
|
media_type = media_type.lstrip("/")
|
||||||
|
|
||||||
if data_type != "base64":
|
if data_type != "base64":
|
||||||
msg = "Only base64 data URLs are supported for now."
|
msg = "Only base64 data URLs are supported for now."
|
||||||
@@ -115,7 +118,7 @@ class MediaConnector:
|
|||||||
|
|
||||||
def _load_file_url(
|
def _load_file_url(
|
||||||
self,
|
self,
|
||||||
url_spec: ParseResult,
|
url_spec: Url,
|
||||||
media_io: MediaIO[_M],
|
media_io: MediaIO[_M],
|
||||||
) -> _M: # type: ignore[type-var]
|
) -> _M: # type: ignore[type-var]
|
||||||
allowed_local_media_path = self.allowed_local_media_path
|
allowed_local_media_path = self.allowed_local_media_path
|
||||||
@@ -124,7 +127,9 @@ class MediaConnector:
|
|||||||
"Cannot load local files without `--allowed-local-media-path`."
|
"Cannot load local files without `--allowed-local-media-path`."
|
||||||
)
|
)
|
||||||
|
|
||||||
filepath = Path(url2pathname(url_spec.netloc + url_spec.path))
|
url_spec_path = url_spec.path or ""
|
||||||
|
url_spec_netloc = url_spec.netloc or ""
|
||||||
|
filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
|
||||||
if allowed_local_media_path not in filepath.resolve().parents:
|
if allowed_local_media_path not in filepath.resolve().parents:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"The file path {filepath} must be a subpath "
|
f"The file path {filepath} must be a subpath "
|
||||||
@@ -133,7 +138,7 @@ class MediaConnector:
|
|||||||
|
|
||||||
return media_io.load_file(filepath)
|
return media_io.load_file(filepath)
|
||||||
|
|
||||||
def _assert_url_in_allowed_media_domains(self, url_spec: ParseResult) -> None:
|
def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
|
||||||
if (
|
if (
|
||||||
self.allowed_media_domains
|
self.allowed_media_domains
|
||||||
and url_spec.hostname not in self.allowed_media_domains
|
and url_spec.hostname not in self.allowed_media_domains
|
||||||
@@ -151,9 +156,9 @@ class MediaConnector:
|
|||||||
*,
|
*,
|
||||||
fetch_timeout: int | None = None,
|
fetch_timeout: int | None = None,
|
||||||
) -> _M: # type: ignore[type-var]
|
) -> _M: # type: ignore[type-var]
|
||||||
url_spec = urlparse(url)
|
url_spec = parse_url(url)
|
||||||
|
|
||||||
if url_spec.scheme.startswith("http"):
|
if url_spec.scheme and url_spec.scheme.startswith("http"):
|
||||||
self._assert_url_in_allowed_media_domains(url_spec)
|
self._assert_url_in_allowed_media_domains(url_spec)
|
||||||
|
|
||||||
connection = self.connection
|
connection = self.connection
|
||||||
@@ -181,10 +186,10 @@ class MediaConnector:
|
|||||||
*,
|
*,
|
||||||
fetch_timeout: int | None = None,
|
fetch_timeout: int | None = None,
|
||||||
) -> _M:
|
) -> _M:
|
||||||
url_spec = urlparse(url)
|
url_spec = parse_url(url)
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
if url_spec.scheme.startswith("http"):
|
if url_spec.scheme and url_spec.scheme.startswith("http"):
|
||||||
self._assert_url_in_allowed_media_domains(url_spec)
|
self._assert_url_in_allowed_media_domains(url_spec)
|
||||||
|
|
||||||
connection = self.connection
|
connection = self.connection
|
||||||
|
|||||||
@@ -11,12 +11,12 @@ from collections.abc import (
|
|||||||
Sequence,
|
Sequence,
|
||||||
)
|
)
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import urlparse
|
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
import zmq
|
import zmq
|
||||||
import zmq.asyncio
|
import zmq.asyncio
|
||||||
|
from urllib3.util import parse_url
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@@ -217,13 +217,15 @@ def find_process_using_port(port: int) -> psutil.Process | None:
|
|||||||
|
|
||||||
def split_zmq_path(path: str) -> tuple[str, str, str]:
|
def split_zmq_path(path: str) -> tuple[str, str, str]:
|
||||||
"""Split a zmq path into its parts."""
|
"""Split a zmq path into its parts."""
|
||||||
parsed = urlparse(path)
|
parsed = parse_url(path)
|
||||||
if not parsed.scheme:
|
if not parsed.scheme:
|
||||||
raise ValueError(f"Invalid zmq path: {path}")
|
raise ValueError(f"Invalid zmq path: {path}")
|
||||||
|
|
||||||
scheme = parsed.scheme
|
scheme = parsed.scheme
|
||||||
host = parsed.hostname or ""
|
host = parsed.hostname or ""
|
||||||
port = str(parsed.port or "")
|
port = str(parsed.port or "")
|
||||||
|
if host.startswith("[") and host.endswith("]"):
|
||||||
|
host = host[1:-1] # Remove brackets for IPv6 address
|
||||||
|
|
||||||
if scheme == "tcp" and not all((host, port)):
|
if scheme == "tcp" and not all((host, port)):
|
||||||
# The host and port fields are required for tcp
|
# The host and port fields are required for tcp
|
||||||
|
|||||||
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
|
|||||||
def __init__(self, cache_size: int):
|
def __init__(self, cache_size: int):
|
||||||
self.cache_size = cache_size
|
self.cache_size = cache_size
|
||||||
self.num_free_slots = cache_size
|
self.num_free_slots = cache_size
|
||||||
self.freed: list[str] = []
|
self.allocated: list[str] = []
|
||||||
|
self.to_free: list[str] = []
|
||||||
|
|
||||||
def check_and_update_cache(self, request: Request, input_id: int) -> bool:
|
def check_and_update_cache(self, request: Request, input_id: int) -> bool:
|
||||||
return False
|
return False
|
||||||
@@ -383,7 +384,7 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
|
|||||||
self.num_free_slots -= num_encoder_embeds
|
self.num_free_slots -= num_encoder_embeds
|
||||||
|
|
||||||
mm_hash = request.mm_features[input_id].identifier
|
mm_hash = request.mm_features[input_id].identifier
|
||||||
self.freed.append(mm_hash)
|
self.allocated.append(mm_hash)
|
||||||
|
|
||||||
def free(self, request: Request) -> None:
|
def free(self, request: Request) -> None:
|
||||||
for input_id in range(len(request.mm_features)):
|
for input_id in range(len(request.mm_features)):
|
||||||
@@ -393,9 +394,14 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
|
|||||||
return set(range(len(request.mm_features)))
|
return set(range(len(request.mm_features)))
|
||||||
|
|
||||||
def get_freed_mm_hashes(self) -> list[str]:
|
def get_freed_mm_hashes(self) -> list[str]:
|
||||||
freed = self.freed
|
# As encoder cache is not used for enc-dec models, we can free the entries here
|
||||||
self.freed = []
|
# The actual free happens in the runner, *before* the model is executed.
|
||||||
return freed
|
# Therefore, `to_free` acts as a buffer to free the entries only after the
|
||||||
|
# model is executed, mimicking the state transition of `EncoderCacheManager`.
|
||||||
|
to_free = self.to_free
|
||||||
|
self.to_free = self.allocated
|
||||||
|
self.allocated = []
|
||||||
|
return to_free
|
||||||
|
|
||||||
def free_encoder_input(self, request: Request, input_id: int) -> None:
|
def free_encoder_input(self, request: Request, input_id: int) -> None:
|
||||||
num_encoder_embeds = request.get_num_encoder_embeds(input_id)
|
num_encoder_embeds = request.get_num_encoder_embeds(input_id)
|
||||||
|
|||||||
Reference in New Issue
Block a user