Compare commits
11 Commits
v0.16.0rc2
...
v0.16.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
89a77b1084 | ||
|
|
d3c1513f5f | ||
|
|
5dbfbc967b | ||
|
|
c86cdcbcd2 | ||
|
|
3c9496f146 | ||
|
|
2d5be1dd5c | ||
|
|
7a06e5b05b | ||
|
|
946b2f106c | ||
|
|
5e8adb0c49 | ||
|
|
9be1ff2d3a | ||
|
|
b3ee90f961 |
@@ -3,7 +3,6 @@ steps:
|
|||||||
- label: ":docker: Build image"
|
- label: ":docker: Build image"
|
||||||
key: image-build
|
key: image-build
|
||||||
depends_on: []
|
depends_on: []
|
||||||
timeout_in_minutes: 600
|
|
||||||
commands:
|
commands:
|
||||||
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
|
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
|
||||||
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
|
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ BUILDKITE_COMMIT=$3
|
|||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||||
|
|
||||||
# skip build if image already exists
|
# skip build if image already exists
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
|
||||||
echo "Image not found, proceeding with build..."
|
echo "Image not found, proceeding with build..."
|
||||||
else
|
else
|
||||||
echo "Image found"
|
echo "Image found"
|
||||||
@@ -24,10 +24,10 @@ fi
|
|||||||
# build
|
# build
|
||||||
docker build --file docker/Dockerfile.cpu \
|
docker build --file docker/Dockerfile.cpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
|
||||||
--target vllm-test \
|
--target vllm-test \
|
||||||
--progress plain .
|
--progress plain .
|
||||||
|
|
||||||
# push
|
# push
|
||||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
|
||||||
|
|||||||
@@ -248,8 +248,8 @@ steps:
|
|||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
DOCKERHUB_USERNAME: "vllmbot"
|
||||||
|
|
||||||
- group: "Publish wheels"
|
- group: "Publish release artifacts"
|
||||||
key: "publish-wheels"
|
key: "publish-release-artifacts"
|
||||||
steps:
|
steps:
|
||||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||||
key: block-upload-release-wheels
|
key: block-upload-release-wheels
|
||||||
@@ -266,6 +266,27 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||||
|
|
||||||
|
- block: "Confirm update release images to DockerHub"
|
||||||
|
key: block-update-release-images-dockerhub
|
||||||
|
depends_on:
|
||||||
|
- input-release-version
|
||||||
|
- annotate-release-workflow
|
||||||
|
|
||||||
|
- label: "Publish release images to DockerHub"
|
||||||
|
depends_on:
|
||||||
|
- block-update-release-images-dockerhub
|
||||||
|
agents:
|
||||||
|
queue: small_cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "bash .buildkite/scripts/push-release-images-dockerhub.sh"
|
||||||
|
plugins:
|
||||||
|
- docker-login#v3.0.0:
|
||||||
|
username: vllmbot
|
||||||
|
password-env: DOCKERHUB_TOKEN
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
DOCKERHUB_USERNAME: "vllmbot"
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ROCm Release Pipeline (x86_64 only)
|
# ROCm Release Pipeline (x86_64 only)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
98
.buildkite/scripts/push-release-images-dockerhub.sh
Normal file
98
.buildkite/scripts/push-release-images-dockerhub.sh
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
#!/bin/bash
#
# Publish the images of one vLLM release to DockerHub.
#
# Steps:
#   1. Read the release version from the Buildkite meta-data key
#      `release-version` (a leading "v" is stripped).
#   2. Log in to the public ECR registry and pull the images built for
#      ${BUILDKITE_COMMIT} (CUDA, ROCm) and for v${RELEASE_VERSION} (CPU).
#   3. Re-tag every image as `latest*` and `v${RELEASE_VERSION}*` under the
#      `vllm/` DockerHub repositories and push both tags.
#   4. Create and push the multi-arch manifest lists.
#
# Required environment:
#   BUILDKITE_COMMIT  commit whose images are published.
#
# The DockerHub login itself is not performed here; it has to be done by the
# caller before this script runs.

set -ex

RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
    echo "RELEASE_VERSION is not set"
    exit 1
fi

# Fail early instead of attempting to pull a tag such as ":-x86_64".
if [ -z "${BUILDKITE_COMMIT}" ]; then
    echo "BUILDKITE_COMMIT is not set"
    exit 1
fi

ECR_REGISTRY="public.ecr.aws/q9t5s3a7"

# Tag an already-pulled image for DockerHub and push it.
#   $1 source image (ECR reference)
#   $2 DockerHub repository, e.g. vllm/vllm-openai
#   $3 tag of the local intermediate alias (tagged, never pushed)
#   $4 suffix appended to the `latest` and `v<version>` tags (may be empty)
publish_image() {
    local source_image=$1
    local repo=$2
    local local_alias=$3
    local suffix=$4

    docker tag "${source_image}" "${repo}:${local_alias}"
    docker tag "${repo}:${local_alias}" "${repo}:latest${suffix}"
    docker tag "${repo}:${local_alias}" "${repo}:v${RELEASE_VERSION}${suffix}"
    docker push "${repo}:latest${suffix}"
    docker push "${repo}:v${RELEASE_VERSION}${suffix}"
}

# Create and push the `latest` and `v<version>` manifest lists of a repository.
#   $1 DockerHub repository
#   $2 suffix shared by the manifest tag and its per-arch members (may be empty)
#   $3.. architecture names of the member images
publish_manifests() {
    local repo=$1
    local suffix=$2
    shift 2

    local prefix arch manifest
    local -a members
    for prefix in "latest" "v${RELEASE_VERSION}"; do
        manifest="${repo}:${prefix}${suffix}"
        members=()
        for arch in "$@"; do
            members+=("${repo}:${prefix}-${arch}${suffix}")
        done

        # `docker manifest create` refuses to overwrite a manifest list that
        # is still in local storage, so drop any stale copy first. The removal
        # fails when nothing is stored locally (e.g. on a fresh agent); that
        # must not abort the script under `set -e`.
        docker manifest rm "${manifest}" || true
        docker manifest create "${manifest}" "${members[@]}"
        docker manifest push "${manifest}"
    done
}

aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "${ECR_REGISTRY}"

# Download images:

for tag_suffix in x86_64 aarch64 x86_64-cu130 aarch64-cu130 rocm-base rocm; do
    docker pull "${ECR_REGISTRY}/vllm-release-repo:${BUILDKITE_COMMIT}-${tag_suffix}"
done
docker pull "${ECR_REGISTRY}/vllm-cpu-release-repo:v${RELEASE_VERSION}"
docker pull "${ECR_REGISTRY}/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}"

# Tag and push images:

## CUDA

for variant in x86_64 x86_64-cu130 aarch64 aarch64-cu130; do
    publish_image "${ECR_REGISTRY}/vllm-release-repo:${BUILDKITE_COMMIT}-${variant}" \
        vllm/vllm-openai "${variant}" "-${variant}"
done

## ROCm

publish_image "${ECR_REGISTRY}/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" \
    vllm/vllm-openai-rocm "${BUILDKITE_COMMIT}" ""
publish_image "${ECR_REGISTRY}/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" \
    vllm/vllm-openai-rocm "${BUILDKITE_COMMIT}-base" "-base"

## CPU

publish_image "${ECR_REGISTRY}/vllm-cpu-release-repo:v${RELEASE_VERSION}" \
    vllm/vllm-openai-cpu x86_64 "-x86_64"
publish_image "${ECR_REGISTRY}/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}" \
    vllm/vllm-openai-cpu arm64 "-arm64"

# Create multi-arch manifest:

publish_manifests vllm/vllm-openai "" x86_64 aarch64
publish_manifests vllm/vllm-openai "-cu130" x86_64 aarch64
publish_manifests vllm/vllm-openai-cpu "" x86_64 arm64
|
||||||
@@ -56,8 +56,8 @@ endif()
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from docker/Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
|
|||||||
@@ -686,6 +686,7 @@ def get_model_params(config):
|
|||||||
"DeepseekV2ForCausalLM",
|
"DeepseekV2ForCausalLM",
|
||||||
"DeepseekV3ForCausalLM",
|
"DeepseekV3ForCausalLM",
|
||||||
"DeepseekV32ForCausalLM",
|
"DeepseekV32ForCausalLM",
|
||||||
|
"GlmMoeDsaForCausalLM",
|
||||||
"Glm4MoeForCausalLM",
|
"Glm4MoeForCausalLM",
|
||||||
"Glm4MoeLiteForCausalLM",
|
"Glm4MoeLiteForCausalLM",
|
||||||
"NemotronHForCausalLM",
|
"NemotronHForCausalLM",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
|
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
|
||||||
|
|
||||||
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
|
set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
|
||||||
|
|
||||||
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
|
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
|
||||||
# be directly set to the triton_kernels python directory.
|
# be directly set to the triton_kernels python directory.
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ requires = [
|
|||||||
"packaging>=24.2",
|
"packaging>=24.2",
|
||||||
"setuptools>=77.0.3,<81.0.0",
|
"setuptools>=77.0.3,<81.0.0",
|
||||||
"setuptools-scm>=8.0",
|
"setuptools-scm>=8.0",
|
||||||
"torch == 2.10.0",
|
"torch == 2.9.1",
|
||||||
"wheel",
|
"wheel",
|
||||||
"jinja2",
|
"jinja2",
|
||||||
"grpcio-tools==1.78.0",
|
"grpcio-tools==1.78.0",
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ ninja
|
|||||||
packaging>=24.2
|
packaging>=24.2
|
||||||
setuptools>=77.0.3,<81.0.0
|
setuptools>=77.0.3,<81.0.0
|
||||||
setuptools-scm>=8
|
setuptools-scm>=8
|
||||||
torch==2.10.0
|
torch==2.9.1
|
||||||
wheel
|
wheel
|
||||||
jinja2>=3.1.6
|
jinja2>=3.1.6
|
||||||
regex
|
regex
|
||||||
|
|||||||
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
|
|||||||
|
|
||||||
# Dependencies for NVIDIA GPUs
|
# Dependencies for NVIDIA GPUs
|
||||||
ray[cgraph]>=2.48.0
|
ray[cgraph]>=2.48.0
|
||||||
torch==2.10.0
|
torch==2.9.1
|
||||||
torchaudio==2.10.0
|
torchaudio==2.9.1
|
||||||
# These must be updated alongside torch
|
# These must be updated alongside torch
|
||||||
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||||
# FlashInfer should be updated together with the Dockerfile
|
# FlashInfer should be updated together with the Dockerfile
|
||||||
flashinfer-python==0.6.3
|
flashinfer-python==0.6.3
|
||||||
|
|||||||
@@ -43,5 +43,5 @@ tritonclient>=2.51.0
|
|||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
numpy
|
numpy
|
||||||
runai-model-streamer[s3,gcs]==0.15.3
|
runai-model-streamer[s3,gcs]==0.15.3
|
||||||
fastsafetensors>=0.1.10
|
fastsafetensors>=0.2.2
|
||||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
# Common dependencies
|
# Common dependencies
|
||||||
-r common.txt
|
-r common.txt
|
||||||
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/test/rocm7.0
|
--extra-index-url https://download.pytorch.org/whl/rocm6.4
|
||||||
torch==2.10.0
|
torch==2.9.1
|
||||||
torchvision==0.25.0
|
torchvision==0.24.1
|
||||||
torchaudio==2.10.0
|
torchaudio==2.9.1
|
||||||
triton==3.6.0
|
|
||||||
|
triton==3.5.1
|
||||||
cmake>=3.26.1,<4
|
cmake>=3.26.1,<4
|
||||||
packaging>=24.2
|
packaging>=24.2
|
||||||
setuptools>=77.0.3,<80.0.0
|
setuptools>=77.0.3,<80.0.0
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
# Common dependencies
|
# Common dependencies
|
||||||
-r common.txt
|
-r common.txt
|
||||||
|
|
||||||
|
# The version of gRPC libraries should be consistent with each other
|
||||||
|
grpcio==1.78.0
|
||||||
|
grpcio-reflection==1.78.0
|
||||||
|
grpcio-tools==1.78.0
|
||||||
|
|
||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
|
|
||||||
# Dependencies for AMD GPUs
|
# Dependencies for AMD GPUs
|
||||||
@@ -15,4 +20,3 @@ setuptools-scm>=8
|
|||||||
runai-model-streamer[s3,gcs]==0.15.3
|
runai-model-streamer[s3,gcs]==0.15.3
|
||||||
conch-triton-kernels==1.2.1
|
conch-triton-kernels==1.2.1
|
||||||
timm>=1.0.17
|
timm>=1.0.17
|
||||||
grpcio-tools==1.78.0 # Should match `build.txt`
|
|
||||||
@@ -24,10 +24,10 @@ sentence-transformers>=5.2.0 # required for embedding tests
|
|||||||
soundfile # required for audio tests
|
soundfile # required for audio tests
|
||||||
jiwer # required for audio tests
|
jiwer # required for audio tests
|
||||||
tblib # for pickling test exceptions
|
tblib # for pickling test exceptions
|
||||||
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
timm==1.0.17 # required for internvl and gemma3n-mm test
|
||||||
torch==2.10.0
|
torch==2.9.1
|
||||||
torchaudio==2.10.0
|
torchaudio==2.9.1
|
||||||
torchvision==0.25.0
|
torchvision==0.24.1
|
||||||
transformers_stream_generator # required for qwen-vl test
|
transformers_stream_generator # required for qwen-vl test
|
||||||
matplotlib # required for qwen-vl test
|
matplotlib # required for qwen-vl test
|
||||||
mistral_common[image,audio] >= 1.9.0 # required for voxtral test
|
mistral_common[image,audio] >= 1.9.0 # required for voxtral test
|
||||||
@@ -48,12 +48,16 @@ buildkite-test-collector==0.1.9
|
|||||||
genai_perf>=0.0.8
|
genai_perf>=0.0.8
|
||||||
tritonclient>=2.51.0
|
tritonclient>=2.51.0
|
||||||
|
|
||||||
grpcio-tools==1.78.0 # Should match `build.txt`
|
# The version of gRPC libraries should be consistent with each other
|
||||||
|
grpcio==1.78.0
|
||||||
|
grpcio-reflection==1.78.0
|
||||||
|
grpcio-tools==1.78.0
|
||||||
|
|
||||||
arctic-inference == 0.1.1 # Required for suffix decoding test
|
arctic-inference == 0.1.1 # Required for suffix decoding test
|
||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
numpy
|
numpy
|
||||||
runai-model-streamer[s3,gcs]==0.15.3
|
runai-model-streamer[s3,gcs]==0.15.3
|
||||||
fastsafetensors>=0.1.10
|
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
|
||||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||||
decord==0.6.0
|
decord==0.6.0
|
||||||
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
|
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
|
||||||
|
|||||||
@@ -155,10 +155,6 @@ coverage==7.10.6
|
|||||||
# via pytest-cov
|
# via pytest-cov
|
||||||
cramjam==2.9.0
|
cramjam==2.9.0
|
||||||
# via fastparquet
|
# via fastparquet
|
||||||
cuda-bindings==12.9.4
|
|
||||||
# via torch
|
|
||||||
cuda-pathfinder==1.3.3
|
|
||||||
# via cuda-bindings
|
|
||||||
cupy-cuda12x==13.6.0
|
cupy-cuda12x==13.6.0
|
||||||
# via ray
|
# via ray
|
||||||
cycler==0.12.1
|
cycler==0.12.1
|
||||||
@@ -224,7 +220,7 @@ fastparquet==2024.11.0
|
|||||||
# via genai-perf
|
# via genai-perf
|
||||||
fastrlock==0.8.2
|
fastrlock==0.8.2
|
||||||
# via cupy-cuda12x
|
# via cupy-cuda12x
|
||||||
fastsafetensors==0.1.10
|
fastsafetensors==0.2.2
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
filelock==3.16.1
|
filelock==3.16.1
|
||||||
# via
|
# via
|
||||||
@@ -309,8 +305,13 @@ greenlet==3.2.3
|
|||||||
# via sqlalchemy
|
# via sqlalchemy
|
||||||
grpcio==1.78.0
|
grpcio==1.78.0
|
||||||
# via
|
# via
|
||||||
|
# -r requirements/test.in
|
||||||
|
# grpcio-reflection
|
||||||
# grpcio-tools
|
# grpcio-tools
|
||||||
# ray
|
# ray
|
||||||
|
# tensorboard
|
||||||
|
grpcio-reflection==1.78.0
|
||||||
|
# via -r requirements/test.in
|
||||||
grpcio-tools==1.78.0
|
grpcio-tools==1.78.0
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
gunicorn==23.0.0
|
gunicorn==23.0.0
|
||||||
@@ -635,7 +636,7 @@ nvidia-nvjitlink-cu12==12.9.86
|
|||||||
# nvidia-cusolver-cu12
|
# nvidia-cusolver-cu12
|
||||||
# nvidia-cusparse-cu12
|
# nvidia-cusparse-cu12
|
||||||
# torch
|
# torch
|
||||||
nvidia-nvshmem-cu12==3.4.5
|
nvidia-nvshmem-cu12==3.3.20
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-nvtx-cu12==12.9.79
|
nvidia-nvtx-cu12==12.9.79
|
||||||
# via torch
|
# via torch
|
||||||
@@ -785,6 +786,7 @@ protobuf==6.33.2
|
|||||||
# via
|
# via
|
||||||
# google-api-core
|
# google-api-core
|
||||||
# googleapis-common-protos
|
# googleapis-common-protos
|
||||||
|
# grpcio-reflection
|
||||||
# grpcio-tools
|
# grpcio-tools
|
||||||
# mlflow-skinny
|
# mlflow-skinny
|
||||||
# opentelemetry-proto
|
# opentelemetry-proto
|
||||||
@@ -1167,14 +1169,13 @@ tomli==2.2.1
|
|||||||
# via schemathesis
|
# via schemathesis
|
||||||
tomli-w==1.2.0
|
tomli-w==1.2.0
|
||||||
# via schemathesis
|
# via schemathesis
|
||||||
torch==2.10.0+cu129
|
torch==2.9.1+cu129
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# accelerate
|
# accelerate
|
||||||
# bitsandbytes
|
# bitsandbytes
|
||||||
# efficientnet-pytorch
|
# efficientnet-pytorch
|
||||||
# encodec
|
# encodec
|
||||||
# fastsafetensors
|
|
||||||
# kornia
|
# kornia
|
||||||
# lightly
|
# lightly
|
||||||
# lightning
|
# lightning
|
||||||
@@ -1196,7 +1197,7 @@ torch==2.10.0+cu129
|
|||||||
# torchvision
|
# torchvision
|
||||||
# vector-quantize-pytorch
|
# vector-quantize-pytorch
|
||||||
# vocos
|
# vocos
|
||||||
torchaudio==2.10.0+cu129
|
torchaudio==2.9.1+cu129
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# encodec
|
# encodec
|
||||||
@@ -1209,7 +1210,7 @@ torchmetrics==1.7.4
|
|||||||
# pytorch-lightning
|
# pytorch-lightning
|
||||||
# terratorch
|
# terratorch
|
||||||
# torchgeo
|
# torchgeo
|
||||||
torchvision==0.25.0+cu129
|
torchvision==0.24.1+cu129
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# lightly
|
# lightly
|
||||||
@@ -1251,7 +1252,7 @@ transformers==4.57.5
|
|||||||
# transformers-stream-generator
|
# transformers-stream-generator
|
||||||
transformers-stream-generator==0.0.5
|
transformers-stream-generator==0.0.5
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
triton==3.6.0
|
triton==3.5.1
|
||||||
# via torch
|
# via torch
|
||||||
tritonclient==2.64.0
|
tritonclient==2.64.0
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -1035,7 +1035,7 @@ setup(
|
|||||||
extras_require={
|
extras_require={
|
||||||
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
|
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
|
||||||
"tensorizer": ["tensorizer==2.10.1"],
|
"tensorizer": ["tensorizer==2.10.1"],
|
||||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
|
||||||
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
|
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
|
||||||
"audio": [
|
"audio": [
|
||||||
"librosa",
|
"librosa",
|
||||||
|
|||||||
@@ -90,7 +90,9 @@ def use_vllm_config(vllm_config: VllmConfig):
|
|||||||
yield
|
yield
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
|
def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
vllm_config = make_vllm_config()
|
vllm_config = make_vllm_config()
|
||||||
@@ -114,7 +116,9 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
|
|||||||
assert torch.allclose(actual, expected)
|
assert torch.allclose(actual, expected)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
|
def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
|
||||||
with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
|
with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
|
||||||
args = (torch.randn(10, 10),)
|
args = (torch.randn(10, 10),)
|
||||||
@@ -128,7 +132,9 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
|
|||||||
CompiledMod(vllm_config=vllm_config)(*args)
|
CompiledMod(vllm_config=vllm_config)(*args)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
|
def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
args = (torch.randn(10, 10),)
|
args = (torch.randn(10, 10),)
|
||||||
@@ -156,7 +162,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
|
|||||||
assert torch.allclose(ret, expected)
|
assert torch.allclose(ret, expected)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
|
def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
Test that cache loading correctly handles the returns_tuple logic.
|
Test that cache loading correctly handles the returns_tuple logic.
|
||||||
@@ -215,7 +223,9 @@ def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_cache_load_returns_tuple_consistency_tuple_output(
|
def test_cache_load_returns_tuple_consistency_tuple_output(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
):
|
):
|
||||||
@@ -284,7 +294,9 @@ def test_cache_load_returns_tuple_consistency_tuple_output(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
Test that the shape environment is correctly serialized and preserved
|
Test that the shape environment is correctly serialized and preserved
|
||||||
@@ -321,7 +333,9 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
|||||||
assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
|
assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_partition_wrapper_applied_on_aot_load(
|
def test_partition_wrapper_applied_on_aot_load(
|
||||||
monkeypatch: pytest.MonkeyPatch, vllm_tmp_cache: Path, mocker
|
monkeypatch: pytest.MonkeyPatch, vllm_tmp_cache: Path, mocker
|
||||||
):
|
):
|
||||||
@@ -412,7 +426,9 @@ def test_partition_wrapper_applied_on_aot_load(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
@create_new_process_for_each_test("spawn")
|
@create_new_process_for_each_test("spawn")
|
||||||
def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
|
def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
@@ -476,7 +492,9 @@ def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
|
|||||||
symbolic_shapes_module.make_symbol = original_make_symbol
|
symbolic_shapes_module.make_symbol = original_make_symbol
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
class TestStandaloneCompiledArtifacts:
|
class TestStandaloneCompiledArtifacts:
|
||||||
def test_init(self):
|
def test_init(self):
|
||||||
cache = StandaloneCompiledArtifacts()
|
cache = StandaloneCompiledArtifacts()
|
||||||
@@ -650,7 +668,9 @@ class TestStandaloneCompiledArtifacts:
|
|||||||
assert len(restored_cache.loaded_submodule_store) == 0
|
assert len(restored_cache.loaded_submodule_store) == 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
class TestStandaloneCompiledArtifactsIntegration:
|
class TestStandaloneCompiledArtifactsIntegration:
|
||||||
def test_add_pickle_unpickle(self):
|
def test_add_pickle_unpickle(self):
|
||||||
cache = StandaloneCompiledArtifacts()
|
cache = StandaloneCompiledArtifacts()
|
||||||
|
|||||||
@@ -39,7 +39,9 @@ def get_test_models():
|
|||||||
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
|
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
|
||||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||||
@pytest.mark.parametrize("evaluate_guards", [False, True])
|
@pytest.mark.parametrize("evaluate_guards", [False, True])
|
||||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
@pytest.mark.skipif(
|
||||||
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||||
|
)
|
||||||
def test_dynamic_shapes_compilation(
|
def test_dynamic_shapes_compilation(
|
||||||
monkeypatch,
|
monkeypatch,
|
||||||
model_name,
|
model_name,
|
||||||
|
|||||||
@@ -129,5 +129,5 @@ async def test_multi_chunk_streaming(
|
|||||||
" First words I spoke in the original phonograph."
|
" First words I spoke in the original phonograph."
|
||||||
" A little piece of practical poetry. Mary had a little lamb,"
|
" A little piece of practical poetry. Mary had a little lamb,"
|
||||||
" it sleeps with quite a flow, and everywhere that Mary went,"
|
" it sleeps with quite a flow, and everywhere that Mary went,"
|
||||||
" the lamb was sure to go"
|
" the lamb was sure to go."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -14,7 +14,6 @@ import torch.nn as nn
|
|||||||
from vllm.config import VllmConfig, set_current_vllm_config
|
from vllm.config import VllmConfig, set_current_vllm_config
|
||||||
from vllm.forward_context import set_forward_context
|
from vllm.forward_context import set_forward_context
|
||||||
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
|
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
|
||||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleLinear(nn.Module):
|
class SimpleLinear(nn.Module):
|
||||||
@@ -61,10 +60,6 @@ def setup_cuda():
|
|||||||
@pytest.mark.parametrize("num_tokens", [1, 32])
|
@pytest.mark.parametrize("num_tokens", [1, 32])
|
||||||
@pytest.mark.parametrize("hidden_size,latent_size", [(256, 128), (128, 64)])
|
@pytest.mark.parametrize("hidden_size,latent_size", [(256, 128), (128, 64)])
|
||||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||||
@pytest.mark.skipif(
|
|
||||||
is_torch_equal_or_newer("2.10.0"),
|
|
||||||
reason="Test fails with PyTorch 2.10.0 see: https://github.com/vllm-project/vllm/issues/33995",
|
|
||||||
)
|
|
||||||
def test_routed_input_transform_inside_vs_outside(
|
def test_routed_input_transform_inside_vs_outside(
|
||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
|
|||||||
@@ -275,6 +275,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
"zai-org/GLM-4.7-Flash",
|
"zai-org/GLM-4.7-Flash",
|
||||||
min_transformers_version="5.0.0",
|
min_transformers_version="5.0.0",
|
||||||
),
|
),
|
||||||
|
"GlmMoeDsaForCausalLM": _HfExamplesInfo(
|
||||||
|
"zai-org/GLM-5", min_transformers_version="5.0.1", is_available_online=False
|
||||||
|
),
|
||||||
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
|
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
|
||||||
"GPTBigCodeForCausalLM": _HfExamplesInfo(
|
"GPTBigCodeForCausalLM": _HfExamplesInfo(
|
||||||
"bigcode/starcoder",
|
"bigcode/starcoder",
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ def can_initialize(
|
|||||||
"pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
|
"pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
|
||||||
)
|
)
|
||||||
|
|
||||||
if model_arch == "DeepseekV32ForCausalLM":
|
if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
capability = current_platform.get_device_capability()
|
capability = current_platform.get_device_capability()
|
||||||
|
|||||||
@@ -7,7 +7,8 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
|
TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
|
||||||
TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-main}"
|
# Pin to a specific release for reproducibility; update as needed.
|
||||||
|
TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}"
|
||||||
|
|
||||||
echo "=== TorchCodec Installation Script ==="
|
echo "=== TorchCodec Installation Script ==="
|
||||||
|
|
||||||
|
|||||||
@@ -233,7 +233,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
|
|||||||
|
|
||||||
from torch._inductor import standalone_compile
|
from torch._inductor import standalone_compile
|
||||||
|
|
||||||
supports_aot = is_torch_equal_or_newer("2.10.0")
|
supports_aot = is_torch_equal_or_newer("2.10.0.dev")
|
||||||
|
|
||||||
if not supports_aot and envs.VLLM_USE_MEGA_AOT_ARTIFACT:
|
if not supports_aot and envs.VLLM_USE_MEGA_AOT_ARTIFACT:
|
||||||
logger.error(
|
logger.error(
|
||||||
|
|||||||
@@ -333,7 +333,7 @@ def _support_torch_compile(
|
|||||||
) -> None:
|
) -> None:
|
||||||
def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
|
def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
|
||||||
if ds_type == DynamicShapesType.UNBACKED:
|
if ds_type == DynamicShapesType.UNBACKED:
|
||||||
if is_torch_equal_or_newer("2.10.0"):
|
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||||
for dim in dims:
|
for dim in dims:
|
||||||
torch._dynamo.decorators.mark_unbacked(
|
torch._dynamo.decorators.mark_unbacked(
|
||||||
arg, dim, hint_override=arg.size()[dim]
|
arg, dim, hint_override=arg.size()[dim]
|
||||||
@@ -373,7 +373,7 @@ def _support_torch_compile(
|
|||||||
if isinstance(arg, torch.Tensor):
|
if isinstance(arg, torch.Tensor):
|
||||||
# In case dims is specified with negative indexing
|
# In case dims is specified with negative indexing
|
||||||
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
|
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
|
||||||
if is_torch_equal_or_newer("2.10.0"):
|
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||||
for dim in dims:
|
for dim in dims:
|
||||||
torch._dynamo.decorators.mark_unbacked(
|
torch._dynamo.decorators.mark_unbacked(
|
||||||
arg, dim, hint_override=arg.size()[dim]
|
arg, dim, hint_override=arg.size()[dim]
|
||||||
@@ -525,9 +525,9 @@ def _support_torch_compile(
|
|||||||
fx_config_patches["backed_size_oblivious"] = True
|
fx_config_patches["backed_size_oblivious"] = True
|
||||||
|
|
||||||
# Prepare inductor config patches
|
# Prepare inductor config patches
|
||||||
# assume_32bit_indexing is only available in torch 2.10.0+
|
# assume_32bit_indexing is only available in torch 2.10.0.dev+
|
||||||
inductor_config_patches = {}
|
inductor_config_patches = {}
|
||||||
if is_torch_equal_or_newer("2.10.0"):
|
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||||
inductor_config_patches["assume_32bit_indexing"] = (
|
inductor_config_patches["assume_32bit_indexing"] = (
|
||||||
self.compilation_config.dynamic_shapes_config.assume_32_bit_indexing
|
self.compilation_config.dynamic_shapes_config.assume_32_bit_indexing
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -181,7 +181,7 @@ class SpeculativeConfig:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
|
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
|
||||||
initial_architecture = hf_config.architectures[0]
|
initial_architecture = hf_config.architectures[0]
|
||||||
if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
|
if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
|
||||||
hf_config.model_type = "deepseek_mtp"
|
hf_config.model_type = "deepseek_mtp"
|
||||||
if hf_config.model_type == "deepseek_mtp":
|
if hf_config.model_type == "deepseek_mtp":
|
||||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||||
|
|||||||
@@ -48,7 +48,6 @@ class RealtimeConnection:
|
|||||||
self.generation_task: asyncio.Task | None = None
|
self.generation_task: asyncio.Task | None = None
|
||||||
|
|
||||||
self._is_connected = False
|
self._is_connected = False
|
||||||
self._is_input_finished = False
|
|
||||||
self._is_model_validated = False
|
self._is_model_validated = False
|
||||||
|
|
||||||
self._max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
|
self._max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
|
||||||
@@ -145,7 +144,7 @@ class RealtimeConnection:
|
|||||||
commit_event = InputAudioBufferCommit(**event)
|
commit_event = InputAudioBufferCommit(**event)
|
||||||
# final signals that the audio is finished
|
# final signals that the audio is finished
|
||||||
if commit_event.final:
|
if commit_event.final:
|
||||||
self._is_input_finished = True
|
self.audio_queue.put_nowait(None)
|
||||||
else:
|
else:
|
||||||
await self.start_generation()
|
await self.start_generation()
|
||||||
else:
|
else:
|
||||||
@@ -239,11 +238,6 @@ class RealtimeConnection:
|
|||||||
# finish because websocket connection was killed
|
# finish because websocket connection was killed
|
||||||
break
|
break
|
||||||
|
|
||||||
if self.audio_queue.empty() and self._is_input_finished:
|
|
||||||
# finish because client signals that audio input
|
|
||||||
# is finished
|
|
||||||
break
|
|
||||||
|
|
||||||
usage = UsageInfo(
|
usage = UsageInfo(
|
||||||
prompt_tokens=prompt_token_ids_len,
|
prompt_tokens=prompt_token_ids_len,
|
||||||
completion_tokens=completion_tokens_len,
|
completion_tokens=completion_tokens_len,
|
||||||
|
|||||||
@@ -271,7 +271,7 @@ def use_aot_compile() -> bool:
|
|||||||
|
|
||||||
default_value = (
|
default_value = (
|
||||||
"1"
|
"1"
|
||||||
if is_torch_equal_or_newer("2.11.0.dev") and not disable_compile_cache()
|
if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
|
||||||
else "0"
|
else "0"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -974,7 +974,7 @@ def enable_batch_invariant_mode():
|
|||||||
)
|
)
|
||||||
|
|
||||||
reduced_precision_val = (
|
reduced_precision_val = (
|
||||||
(False, False) if is_torch_equal_or_newer("2.10.0") else False
|
(False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
|
||||||
)
|
)
|
||||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
|
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
|
||||||
reduced_precision_val
|
reduced_precision_val
|
||||||
|
|||||||
@@ -19,42 +19,17 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache
|
|||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
QuantKey,
|
QuantKey,
|
||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.triton_utils import tl, triton
|
from vllm.triton_utils import tl, triton
|
||||||
from vllm.utils.import_utils import has_triton_kernels
|
from vllm.utils.import_utils import has_triton_kernels
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
use_legacy_triton_kernels = False
|
|
||||||
|
|
||||||
if has_triton_kernels():
|
if has_triton_kernels():
|
||||||
try:
|
try:
|
||||||
import triton_kernels.swiglu
|
import triton_kernels.swiglu
|
||||||
from triton_kernels.matmul_ogs import (
|
from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
|
||||||
FnSpecs,
|
from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
|
||||||
FusedActivation,
|
from triton_kernels.tensor import Bitmatrix
|
||||||
GatherIndx,
|
|
||||||
RoutingData,
|
|
||||||
ScatterIndx,
|
|
||||||
matmul_ogs,
|
|
||||||
)
|
|
||||||
from triton_kernels.tensor import (
|
|
||||||
BIT,
|
|
||||||
Bitmatrix,
|
|
||||||
)
|
|
||||||
from triton_kernels.topk import topk
|
|
||||||
|
|
||||||
try:
|
|
||||||
from triton_kernels.tensor import (
|
|
||||||
SparseMatrix,
|
|
||||||
make_ragged_tensor_metadata,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
if current_platform.is_rocm():
|
|
||||||
logger.warning_once("Using legacy triton_kernels on ROCm")
|
|
||||||
use_legacy_triton_kernels = True
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
except (AttributeError, ImportError) as e:
|
except (AttributeError, ImportError) as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
"Failed to import Triton kernels. Please make sure your triton "
|
"Failed to import Triton kernels. Please make sure your triton "
|
||||||
@@ -103,68 +78,6 @@ def pack_bitmatrix(
|
|||||||
tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows)
|
tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows)
|
||||||
|
|
||||||
|
|
||||||
def legacy_routing_from_bitmatrix(
|
|
||||||
bitmatrix: "Bitmatrix",
|
|
||||||
expt_scal: torch.Tensor,
|
|
||||||
expt_indx: torch.Tensor,
|
|
||||||
n_expts_tot: int,
|
|
||||||
n_expts_act: int,
|
|
||||||
) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
|
|
||||||
"""
|
|
||||||
Replacement for the removed triton_kernels.routing.routing_from_bitmatrix.
|
|
||||||
Creates routing data from a bitmatrix representation.
|
|
||||||
"""
|
|
||||||
if use_legacy_triton_kernels:
|
|
||||||
from triton_kernels.routing import routing_from_bitmatrix
|
|
||||||
|
|
||||||
return routing_from_bitmatrix(
|
|
||||||
bitmatrix, expt_scal, expt_indx, n_expts_tot, n_expts_act
|
|
||||||
)
|
|
||||||
sparse_logits = SparseMatrix(indx=expt_indx, vals=expt_scal, mask=bitmatrix)
|
|
||||||
dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx
|
|
||||||
combine_indx = sparse_logits.mask_metadata.col_sorted_indx
|
|
||||||
ragged_batch_metadata = make_ragged_tensor_metadata(
|
|
||||||
sparse_logits.mask_metadata.col_sum,
|
|
||||||
dispatch_indx.shape[0],
|
|
||||||
)
|
|
||||||
gate_scal = sparse_logits.vals.flatten()[combine_indx]
|
|
||||||
routing_data = RoutingData(
|
|
||||||
gate_scal,
|
|
||||||
ragged_batch_metadata.block_sizes,
|
|
||||||
n_expts_tot,
|
|
||||||
n_expts_act,
|
|
||||||
ragged_batch_metadata,
|
|
||||||
)
|
|
||||||
gather_idx = GatherIndx(combine_indx, dispatch_indx)
|
|
||||||
scatter_idx = ScatterIndx(dispatch_indx, combine_indx)
|
|
||||||
return routing_data, gather_idx, scatter_idx
|
|
||||||
|
|
||||||
|
|
||||||
def legacy_routing(
|
|
||||||
logits: torch.Tensor,
|
|
||||||
n_expts_act: int,
|
|
||||||
sm_first: bool = False,
|
|
||||||
) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
|
|
||||||
"""
|
|
||||||
Replacement for the removed triton_kernels.routing.routing function.
|
|
||||||
Computes routing data from gating logits.
|
|
||||||
"""
|
|
||||||
if use_legacy_triton_kernels:
|
|
||||||
from triton_kernels.routing import routing
|
|
||||||
|
|
||||||
return routing(logits, n_expts_act, sm_first=sm_first)
|
|
||||||
if sm_first:
|
|
||||||
logits = torch.softmax(logits, dim=-1)
|
|
||||||
sparse_logits = topk(logits, n_expts_act, apply_softmax=not sm_first)
|
|
||||||
return legacy_routing_from_bitmatrix(
|
|
||||||
sparse_logits.mask,
|
|
||||||
sparse_logits.vals,
|
|
||||||
sparse_logits.indx,
|
|
||||||
logits.shape[-1],
|
|
||||||
n_expts_act,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def triton_kernel_moe_forward(
|
def triton_kernel_moe_forward(
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
w1, # Tensor or triton_kernels.Tensor
|
w1, # Tensor or triton_kernels.Tensor
|
||||||
@@ -178,7 +91,7 @@ def triton_kernel_moe_forward(
|
|||||||
global_num_experts: int = -1,
|
global_num_experts: int = -1,
|
||||||
expert_map: torch.Tensor | None = None,
|
expert_map: torch.Tensor | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
routing_data, gather_idx, scatter_idx = legacy_routing(
|
routing_data, gather_idx, scatter_idx = routing(
|
||||||
gating_output, topk, sm_first=not renormalize
|
gating_output, topk, sm_first=not renormalize
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -254,23 +167,11 @@ def triton_kernel_fused_experts(
|
|||||||
)
|
)
|
||||||
output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))
|
output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))
|
||||||
|
|
||||||
act = (
|
act = FusedActivation(
|
||||||
FusedActivation(
|
|
||||||
FnSpecs(
|
|
||||||
"swiglu",
|
|
||||||
triton_kernels.swiglu.swiglu_fn,
|
|
||||||
("alpha", "limit"),
|
|
||||||
reduction_n=2,
|
|
||||||
),
|
|
||||||
(swiglu_alpha, swiglu_limit),
|
|
||||||
)
|
|
||||||
if not use_legacy_triton_kernels
|
|
||||||
else FusedActivation(
|
|
||||||
FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
|
FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
|
||||||
(swiglu_alpha, swiglu_limit),
|
(swiglu_alpha, swiglu_limit),
|
||||||
2,
|
2,
|
||||||
)
|
)
|
||||||
)
|
|
||||||
gammas = routing_data.gate_scal if routing_data else None
|
gammas = routing_data.gate_scal if routing_data else None
|
||||||
|
|
||||||
matmul_ogs(
|
matmul_ogs(
|
||||||
@@ -330,22 +231,13 @@ def make_routing_data(
|
|||||||
|
|
||||||
bitmatrix_shape = [n_rows, bm_cols * 32]
|
bitmatrix_shape = [n_rows, bm_cols * 32]
|
||||||
bitmatrix_shape_max = [n_rows, None]
|
bitmatrix_shape_max = [n_rows, None]
|
||||||
bitmatrix = (
|
bitmatrix = Bitmatrix(
|
||||||
Bitmatrix(
|
bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
|
||||||
bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max
|
|
||||||
)
|
|
||||||
if not use_legacy_triton_kernels
|
|
||||||
else Bitmatrix(
|
|
||||||
bitmatrix,
|
|
||||||
shape=bitmatrix_shape,
|
|
||||||
shape_max=bitmatrix_shape_max,
|
|
||||||
scratchpad=None,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# matmul_ogs expects invalid topk_weights to be -1s
|
# matmul_ogs expects invalid topk_weights to be -1s
|
||||||
topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
|
topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
|
||||||
routing_data, gather_indx, scatter_indx = legacy_routing_from_bitmatrix(
|
routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
|
||||||
bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
|
bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -836,7 +836,7 @@ class DeepseekV2MLAAttention(nn.Module):
|
|||||||
qk_rope_head_dim,
|
qk_rope_head_dim,
|
||||||
max_position=max_position_embeddings,
|
max_position=max_position_embeddings,
|
||||||
rope_parameters=config.rope_parameters,
|
rope_parameters=config.rope_parameters,
|
||||||
is_neox_style=True,
|
is_neox_style=not getattr(config, "indexer_rope_interleave", False),
|
||||||
)
|
)
|
||||||
self.indexer = Indexer(
|
self.indexer = Indexer(
|
||||||
vllm_config,
|
vllm_config,
|
||||||
@@ -1499,6 +1499,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# Compatibility with
|
# Compatibility with
|
||||||
# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
|
# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
|
||||||
def get_spec_layer_idx_from_weight_name(
|
def get_spec_layer_idx_from_weight_name(
|
||||||
|
|||||||
@@ -114,6 +114,7 @@ _TEXT_GENERATION_MODELS = {
|
|||||||
"Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"),
|
"Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"),
|
||||||
"Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"),
|
"Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"),
|
||||||
"Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"),
|
"Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"),
|
||||||
|
"GlmMoeDsaForCausalLM": ("deepseek_v2", "GlmMoeDsaForCausalLM"),
|
||||||
"GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"),
|
"GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"),
|
||||||
"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
|
"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
|
||||||
"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
|
"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
|
||||||
|
|||||||
@@ -237,6 +237,7 @@ class ModelArchConfigConvertorBase:
|
|||||||
"deepseek_v3",
|
"deepseek_v3",
|
||||||
"deepseek_v32",
|
"deepseek_v32",
|
||||||
"deepseek_mtp",
|
"deepseek_mtp",
|
||||||
|
"glm_moe_dsa",
|
||||||
"glm4_moe_lite",
|
"glm4_moe_lite",
|
||||||
"glm4_moe_lite_mtp",
|
"glm4_moe_lite_mtp",
|
||||||
"kimi_k2",
|
"kimi_k2",
|
||||||
|
|||||||
@@ -1503,6 +1503,24 @@ class SpecDecodeBaseProposer:
|
|||||||
del self.model.lm_head
|
del self.model.lm_head
|
||||||
self.model.lm_head = target_language_model.lm_head
|
self.model.lm_head = target_language_model.lm_head
|
||||||
|
|
||||||
|
# MTP models call compute_logits via shared_head.head (a
|
||||||
|
# ParallelLMHead inside each MTP layer), not self.model.lm_head.
|
||||||
|
# If the checkpoint omits a copy of the lm_head weights at the
|
||||||
|
# MTP layer path, shared_head.head stays uninitialised and
|
||||||
|
# produces NaN logits. Always share it explicitly.
|
||||||
|
inner = getattr(self.model, "model", None)
|
||||||
|
layers = getattr(inner, "layers", None) if inner else None
|
||||||
|
if layers is not None:
|
||||||
|
items = layers.values() if isinstance(layers, nn.ModuleDict) else layers
|
||||||
|
for layer in items:
|
||||||
|
sh = getattr(layer, "shared_head", None)
|
||||||
|
if sh is not None and hasattr(sh, "head"):
|
||||||
|
del sh.head
|
||||||
|
sh.head = target_language_model.lm_head
|
||||||
|
logger.info(
|
||||||
|
"Shared target model lm_head with MTP shared_head.head."
|
||||||
|
)
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def dummy_run(
|
def dummy_run(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user