Compare commits

..

11 Commits

Author SHA1 Message Date
Andreas Karatzas
89a77b1084 [ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit 4c078fa546)
(cherry picked from commit a976961fb77d38129abf69edd4952101731f2421)
2026-02-24 20:30:22 -08:00
Kevin H. Luu
d3c1513f5f [ci] Use the right tag for CPU arm64 image (#34915)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit a1a2d79442)
(cherry picked from commit 772f70839192262ff01c533d821a11a225d1c00f)
2026-02-24 20:30:13 -08:00
Cyrus Leung
5dbfbc967b [CI/Build] Fix gRPC version mismatch (#35013)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 965fe45935)
(cherry picked from commit 90308959295b66049024649fe1273070477f343d)
2026-02-24 20:30:02 -08:00
khluu
c86cdcbcd2 Revert "[Release 2.10] Update to Torch 2.10 - final release (#30525)"
This reverts commit f97ca67176.
2026-02-24 20:28:53 -08:00
khluu
3c9496f146 Revert "[Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)"
This reverts commit 55a1baebc5.
2026-02-24 20:28:45 -08:00
khluu
2d5be1dd5c release script
Signed-off-by: khluu <khluu000@gmail.com>
2026-02-12 02:37:52 -08:00
Michael Goin
7a06e5b05b [Bugfix] Fix MTP accuracy for GLM-5 (#34385)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit ec12d39d44)
2026-02-11 20:54:27 -08:00
Junseo Park
946b2f106c [Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963)
Signed-off-by: pjs102793 <pjs102793@naver.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 5458eb835d)
2026-02-11 20:54:14 -08:00
Nick Hill
5e8adb0c49 [Misc] Bump fastsafetensors version for latest fixes (#34273)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 79504027ef)
2026-02-11 20:54:00 -08:00
Xinyu Dong
9be1ff2d3a [Bugfix] fix default is_neox_style is True for deepseek (#34353)
Signed-off-by: dongxinyu03 <dongxinyu03@baidu.com>
(cherry picked from commit be7f3d5d20)
2026-02-11 20:53:40 -08:00
Jee Jee Li
b3ee90f961 [Model] GLM adaptation (#34124)
(cherry picked from commit 978a37c823)
2026-02-11 20:53:11 -08:00
34 changed files with 262 additions and 202 deletions

View File

@@ -3,7 +3,6 @@ steps:
- label: ":docker: Build image" - label: ":docker: Build image"
key: image-build key: image-build
depends_on: [] depends_on: []
timeout_in_minutes: 600
commands: commands:
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi

View File

@@ -14,7 +14,7 @@ BUILDKITE_COMMIT=$3
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists # skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
echo "Image not found, proceeding with build..." echo "Image not found, proceeding with build..."
else else
echo "Image found" echo "Image found"
@@ -24,10 +24,10 @@ fi
# build # build
docker build --file docker/Dockerfile.cpu \ docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--target vllm-test \ --target vllm-test \
--progress plain . --progress plain .
# push # push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu

View File

@@ -248,8 +248,8 @@ steps:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot" DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels" - group: "Publish release artifacts"
key: "publish-wheels" key: "publish-release-artifacts"
steps: steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?" - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels key: block-upload-release-wheels
@@ -266,6 +266,27 @@ steps:
commands: commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh" - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
- block: "Confirm update release images to DockerHub"
key: block-update-release-images-dockerhub
depends_on:
- input-release-version
- annotate-release-workflow
- label: "Publish release images to DockerHub"
depends_on:
- block-update-release-images-dockerhub
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-release-images-dockerhub.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
# ============================================================================= # =============================================================================
# ROCm Release Pipeline (x86_64 only) # ROCm Release Pipeline (x86_64 only)
# ============================================================================= # =============================================================================

View File

@@ -0,0 +1,98 @@
#!/bin/bash
#
# Publish vLLM release images to DockerHub.
#
# Pulls the per-architecture release images that earlier pipeline steps pushed
# to the public ECR registry, re-tags them as `latest-*` and `v<version>-*`
# under the `vllm/` DockerHub namespace, pushes them, and finally assembles and
# pushes the multi-arch manifest lists.
#
# Inputs:
#   release-version   Buildkite meta-data key; a leading "v" is stripped.
#   BUILDKITE_COMMIT  Commit SHA used as the tag prefix of the ECR images.
#
# The script expects the docker CLI to be already logged in to DockerHub
# (it only logs in to ECR itself).

# -e: abort on the first failing command; -x: trace every command in the log.
set -ex

# `sed` is the last command of the pipeline, so a failing `buildkite-agent`
# does not trip `set -e`; it just yields an empty string that is caught below.
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
  echo "RELEASE_VERSION is not set"
  exit 1
fi

# Fail early with a clear message instead of attempting to pull a tag such as
# ":-x86_64" when the commit SHA is missing.
if [ -z "${BUILDKITE_COMMIT:-}" ]; then
  echo "BUILDKITE_COMMIT is not set"
  exit 1
fi

aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7

# Download images:
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm"
docker pull "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}"
docker pull "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}"

# Tag and push images:
# Each group first creates a local-only intermediate tag, then derives the
# `latest-*` and `v<version>-*` tags from it; only the derived tags are pushed.

## CUDA
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64"
docker push vllm/vllm-openai:latest-x86_64
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130"
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64"
docker push vllm/vllm-openai:latest-aarch64
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130"
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130"

## ROCm
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}"
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}" vllm/vllm-openai-rocm:latest
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}" "vllm/vllm-openai-rocm:v${RELEASE_VERSION}"
docker push vllm/vllm-openai-rocm:latest
docker push "vllm/vllm-openai-rocm:v${RELEASE_VERSION}"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base"
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base" vllm/vllm-openai-rocm:latest-base
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base" "vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base"
docker push vllm/vllm-openai-rocm:latest-base
docker push "vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base"

## CPU
docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}" vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64"
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}" vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64"
docker push vllm/vllm-openai-cpu:latest-arm64
docker push "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64"

# Create multi-arch manifest:
# `docker manifest rm` only clears a locally cached manifest list and exits
# non-zero when none is cached. Under `set -e` that would abort the release
# after the per-arch images were already pushed, so every cleanup below is
# best-effort (`|| true`).
docker manifest rm vllm/vllm-openai:latest || true
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create "vllm/vllm-openai:v${RELEASE_VERSION}" "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64" "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64"
docker manifest push vllm/vllm-openai:latest
docker manifest push "vllm/vllm-openai:v${RELEASE_VERSION}"

docker manifest rm vllm/vllm-openai:latest-cu130 || true
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create "vllm/vllm-openai:v${RELEASE_VERSION}-cu130" "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130" "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130"
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push "vllm/vllm-openai:v${RELEASE_VERSION}-cu130"

docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create "vllm/vllm-openai-cpu:v${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64" "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64"
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push "vllm/vllm-openai-cpu:v${RELEASE_VERSION}"

View File

@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from docker/Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches

View File

@@ -686,6 +686,7 @@ def get_model_params(config):
"DeepseekV2ForCausalLM", "DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM", "DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM", "DeepseekV32ForCausalLM",
"GlmMoeDsaForCausalLM",
"Glm4MoeForCausalLM", "Glm4MoeForCausalLM",
"Glm4MoeLiteForCausalLM", "Glm4MoeLiteForCausalLM",
"NemotronHForCausalLM", "NemotronHForCausalLM",

View File

@@ -1,6 +1,6 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels # Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0") set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to # Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory. # be directly set to the triton_kernels python directory.

View File

@@ -6,7 +6,7 @@ requires = [
"packaging>=24.2", "packaging>=24.2",
"setuptools>=77.0.3,<81.0.0", "setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0", "setuptools-scm>=8.0",
"torch == 2.10.0", "torch == 2.9.1",
"wheel", "wheel",
"jinja2", "jinja2",
"grpcio-tools==1.78.0", "grpcio-tools==1.78.0",

View File

@@ -4,7 +4,7 @@ ninja
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<81.0.0 setuptools>=77.0.3,<81.0.0
setuptools-scm>=8 setuptools-scm>=8
torch==2.10.0 torch==2.9.1
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
regex regex

View File

@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 ray[cgraph]>=2.48.0
torch==2.10.0 torch==2.9.1
torchaudio==2.10.0 torchaudio==2.9.1
# These must be updated alongside torch # These must be updated alongside torch
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile # FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.3 flashinfer-python==0.6.3

View File

@@ -43,5 +43,5 @@ tritonclient>=2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding numba == 0.61.2 # Required for N-gram speculative decoding
numpy numpy
runai-model-streamer[s3,gcs]==0.15.3 runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10 fastsafetensors>=0.2.2
pydantic>=2.12 # 2.11 leads to error on python 3.13 pydantic>=2.12 # 2.11 leads to error on python 3.13

View File

@@ -1,11 +1,12 @@
# Common dependencies # Common dependencies
-r common.txt -r common.txt
--extra-index-url https://download.pytorch.org/whl/test/rocm7.0 --extra-index-url https://download.pytorch.org/whl/rocm6.4
torch==2.10.0 torch==2.9.1
torchvision==0.25.0 torchvision==0.24.1
torchaudio==2.10.0 torchaudio==2.9.1
triton==3.6.0
triton==3.5.1
cmake>=3.26.1,<4 cmake>=3.26.1,<4
packaging>=24.2 packaging>=24.2
setuptools>=77.0.3,<80.0.0 setuptools>=77.0.3,<80.0.0

View File

@@ -1,6 +1,11 @@
# Common dependencies # Common dependencies
-r common.txt -r common.txt
# The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
grpcio-tools==1.78.0
numba == 0.61.2 # Required for N-gram speculative decoding numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for AMD GPUs # Dependencies for AMD GPUs
@@ -15,4 +20,3 @@ setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3 runai-model-streamer[s3,gcs]==0.15.3
conch-triton-kernels==1.2.1 conch-triton-kernels==1.2.1
timm>=1.0.17 timm>=1.0.17
grpcio-tools==1.78.0 # Should match `build.txt`

View File

@@ -24,10 +24,10 @@ sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
tblib # for pickling test exceptions tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test timm==1.0.17 # required for internvl and gemma3n-mm test
torch==2.10.0 torch==2.9.1
torchaudio==2.10.0 torchaudio==2.9.1
torchvision==0.25.0 torchvision==0.24.1
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.9.0 # required for voxtral test mistral_common[image,audio] >= 1.9.0 # required for voxtral test
@@ -48,12 +48,16 @@ buildkite-test-collector==0.1.9
genai_perf>=0.0.8 genai_perf>=0.0.8
tritonclient>=2.51.0 tritonclient>=2.51.0
grpcio-tools==1.78.0 # Should match `build.txt` # The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
grpcio-tools==1.78.0
arctic-inference == 0.1.1 # Required for suffix decoding test arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding numba == 0.61.2 # Required for N-gram speculative decoding
numpy numpy
runai-model-streamer[s3,gcs]==0.15.3 runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10 fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
pydantic>=2.12 # 2.11 leads to error on python 3.13 pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0 decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test

View File

@@ -155,10 +155,6 @@ coverage==7.10.6
# via pytest-cov # via pytest-cov
cramjam==2.9.0 cramjam==2.9.0
# via fastparquet # via fastparquet
cuda-bindings==12.9.4
# via torch
cuda-pathfinder==1.3.3
# via cuda-bindings
cupy-cuda12x==13.6.0 cupy-cuda12x==13.6.0
# via ray # via ray
cycler==0.12.1 cycler==0.12.1
@@ -224,7 +220,7 @@ fastparquet==2024.11.0
# via genai-perf # via genai-perf
fastrlock==0.8.2 fastrlock==0.8.2
# via cupy-cuda12x # via cupy-cuda12x
fastsafetensors==0.1.10 fastsafetensors==0.2.2
# via -r requirements/test.in # via -r requirements/test.in
filelock==3.16.1 filelock==3.16.1
# via # via
@@ -309,8 +305,13 @@ greenlet==3.2.3
# via sqlalchemy # via sqlalchemy
grpcio==1.78.0 grpcio==1.78.0
# via # via
# -r requirements/test.in
# grpcio-reflection
# grpcio-tools # grpcio-tools
# ray # ray
# tensorboard
grpcio-reflection==1.78.0
# via -r requirements/test.in
grpcio-tools==1.78.0 grpcio-tools==1.78.0
# via -r requirements/test.in # via -r requirements/test.in
gunicorn==23.0.0 gunicorn==23.0.0
@@ -635,7 +636,7 @@ nvidia-nvjitlink-cu12==12.9.86
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# nvidia-cusparse-cu12 # nvidia-cusparse-cu12
# torch # torch
nvidia-nvshmem-cu12==3.4.5 nvidia-nvshmem-cu12==3.3.20
# via torch # via torch
nvidia-nvtx-cu12==12.9.79 nvidia-nvtx-cu12==12.9.79
# via torch # via torch
@@ -785,6 +786,7 @@ protobuf==6.33.2
# via # via
# google-api-core # google-api-core
# googleapis-common-protos # googleapis-common-protos
# grpcio-reflection
# grpcio-tools # grpcio-tools
# mlflow-skinny # mlflow-skinny
# opentelemetry-proto # opentelemetry-proto
@@ -1167,14 +1169,13 @@ tomli==2.2.1
# via schemathesis # via schemathesis
tomli-w==1.2.0 tomli-w==1.2.0
# via schemathesis # via schemathesis
torch==2.10.0+cu129 torch==2.9.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# accelerate # accelerate
# bitsandbytes # bitsandbytes
# efficientnet-pytorch # efficientnet-pytorch
# encodec # encodec
# fastsafetensors
# kornia # kornia
# lightly # lightly
# lightning # lightning
@@ -1196,7 +1197,7 @@ torch==2.10.0+cu129
# torchvision # torchvision
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
torchaudio==2.10.0+cu129 torchaudio==2.9.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
@@ -1209,7 +1210,7 @@ torchmetrics==1.7.4
# pytorch-lightning # pytorch-lightning
# terratorch # terratorch
# torchgeo # torchgeo
torchvision==0.25.0+cu129 torchvision==0.24.1+cu129
# via # via
# -r requirements/test.in # -r requirements/test.in
# lightly # lightly
@@ -1251,7 +1252,7 @@ transformers==4.57.5
# transformers-stream-generator # transformers-stream-generator
transformers-stream-generator==0.0.5 transformers-stream-generator==0.0.5
# via -r requirements/test.in # via -r requirements/test.in
triton==3.6.0 triton==3.5.1
# via torch # via torch
tritonclient==2.64.0 tritonclient==2.64.0
# via -r requirements/test.in # via -r requirements/test.in

View File

@@ -1035,7 +1035,7 @@ setup(
extras_require={ extras_require={
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"], "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
"tensorizer": ["tensorizer==2.10.1"], "tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"], "fastsafetensors": ["fastsafetensors >= 0.2.2"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"], "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [ "audio": [
"librosa", "librosa",

View File

@@ -90,7 +90,9 @@ def use_vllm_config(vllm_config: VllmConfig):
yield yield
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch): def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
vllm_config = make_vllm_config() vllm_config = make_vllm_config()
@@ -114,7 +116,9 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
assert torch.allclose(actual, expected) assert torch.allclose(actual, expected)
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_force_aot_load(monkeypatch: pytest.MonkeyPatch): def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m: with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
args = (torch.randn(10, 10),) args = (torch.randn(10, 10),)
@@ -128,7 +132,9 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
CompiledMod(vllm_config=vllm_config)(*args) CompiledMod(vllm_config=vllm_config)(*args)
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_save_and_load(monkeypatch: pytest.MonkeyPatch): def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
args = (torch.randn(10, 10),) args = (torch.randn(10, 10),)
@@ -156,7 +162,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
assert torch.allclose(ret, expected) assert torch.allclose(ret, expected)
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch): def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
""" """
Test that cache loading correctly handles the returns_tuple logic. Test that cache loading correctly handles the returns_tuple logic.
@@ -215,7 +223,9 @@ def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
) )
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_cache_load_returns_tuple_consistency_tuple_output( def test_cache_load_returns_tuple_consistency_tuple_output(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
@@ -284,7 +294,9 @@ def test_cache_load_returns_tuple_consistency_tuple_output(
) )
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_shape_env(monkeypatch: pytest.MonkeyPatch): def test_shape_env(monkeypatch: pytest.MonkeyPatch):
""" """
Test that the shape environment is correctly serialized and preserved Test that the shape environment is correctly serialized and preserved
@@ -321,7 +333,9 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)" assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_partition_wrapper_applied_on_aot_load( def test_partition_wrapper_applied_on_aot_load(
monkeypatch: pytest.MonkeyPatch, vllm_tmp_cache: Path, mocker monkeypatch: pytest.MonkeyPatch, vllm_tmp_cache: Path, mocker
): ):
@@ -412,7 +426,9 @@ def test_partition_wrapper_applied_on_aot_load(
) )
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
@create_new_process_for_each_test("spawn") @create_new_process_for_each_test("spawn")
def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch): def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
""" """
@@ -476,7 +492,9 @@ def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
symbolic_shapes_module.make_symbol = original_make_symbol symbolic_shapes_module.make_symbol = original_make_symbol
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
class TestStandaloneCompiledArtifacts: class TestStandaloneCompiledArtifacts:
def test_init(self): def test_init(self):
cache = StandaloneCompiledArtifacts() cache = StandaloneCompiledArtifacts()
@@ -650,7 +668,9 @@ class TestStandaloneCompiledArtifacts:
assert len(restored_cache.loaded_submodule_store) == 0 assert len(restored_cache.loaded_submodule_store) == 0
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
class TestStandaloneCompiledArtifactsIntegration: class TestStandaloneCompiledArtifactsIntegration:
def test_add_pickle_unpickle(self): def test_add_pickle_unpickle(self):
cache = StandaloneCompiledArtifacts() cache = StandaloneCompiledArtifacts()

View File

@@ -39,7 +39,9 @@ def get_test_models():
@pytest.mark.parametrize("use_aot_compile", ["0", "1"]) @pytest.mark.parametrize("use_aot_compile", ["0", "1"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False]) @pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True]) @pytest.mark.parametrize("evaluate_guards", [False, True])
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_dynamic_shapes_compilation( def test_dynamic_shapes_compilation(
monkeypatch, monkeypatch,
model_name, model_name,

View File

@@ -129,5 +129,5 @@ async def test_multi_chunk_streaming(
" First words I spoke in the original phonograph." " First words I spoke in the original phonograph."
" A little piece of practical poetry. Mary had a little lamb," " A little piece of practical poetry. Mary had a little lamb,"
" it sleeps with quite a flow, and everywhere that Mary went," " it sleeps with quite a flow, and everywhere that Mary went,"
" the lamb was sure to go" " the lamb was sure to go."
) )

View File

@@ -14,7 +14,6 @@ import torch.nn as nn
from vllm.config import VllmConfig, set_current_vllm_config from vllm.config import VllmConfig, set_current_vllm_config
from vllm.forward_context import set_forward_context from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
from vllm.utils.torch_utils import is_torch_equal_or_newer
class SimpleLinear(nn.Module): class SimpleLinear(nn.Module):
@@ -61,10 +60,6 @@ def setup_cuda():
@pytest.mark.parametrize("num_tokens", [1, 32]) @pytest.mark.parametrize("num_tokens", [1, 32])
@pytest.mark.parametrize("hidden_size,latent_size", [(256, 128), (128, 64)]) @pytest.mark.parametrize("hidden_size,latent_size", [(256, 128), (128, 64)])
@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.skipif(
is_torch_equal_or_newer("2.10.0"),
reason="Test fails with PyTorch 2.10.0 see: https://github.com/vllm-project/vllm/issues/33995",
)
def test_routed_input_transform_inside_vs_outside( def test_routed_input_transform_inside_vs_outside(
num_tokens: int, num_tokens: int,
hidden_size: int, hidden_size: int,

View File

@@ -275,6 +275,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"zai-org/GLM-4.7-Flash", "zai-org/GLM-4.7-Flash",
min_transformers_version="5.0.0", min_transformers_version="5.0.0",
), ),
"GlmMoeDsaForCausalLM": _HfExamplesInfo(
"zai-org/GLM-5", min_transformers_version="5.0.1", is_available_online=False
),
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
"GPTBigCodeForCausalLM": _HfExamplesInfo( "GPTBigCodeForCausalLM": _HfExamplesInfo(
"bigcode/starcoder", "bigcode/starcoder",

View File

@@ -97,7 +97,7 @@ def can_initialize(
"pickle error when loading `transformers.models.auto.CONFIG_MAPPING`" "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
) )
if model_arch == "DeepseekV32ForCausalLM": if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
from vllm.platforms import current_platform from vllm.platforms import current_platform
capability = current_platform.get_device_capability() capability = current_platform.get_device_capability()

View File

@@ -7,7 +7,8 @@
set -e set -e
TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}" TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-main}" # Pin to a specific release for reproducibility; update as needed.
TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}"
echo "=== TorchCodec Installation Script ===" echo "=== TorchCodec Installation Script ==="

View File

@@ -233,7 +233,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
from torch._inductor import standalone_compile from torch._inductor import standalone_compile
supports_aot = is_torch_equal_or_newer("2.10.0") supports_aot = is_torch_equal_or_newer("2.10.0.dev")
if not supports_aot and envs.VLLM_USE_MEGA_AOT_ARTIFACT: if not supports_aot and envs.VLLM_USE_MEGA_AOT_ARTIFACT:
logger.error( logger.error(

View File

@@ -333,7 +333,7 @@ def _support_torch_compile(
) -> None: ) -> None:
def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None: def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
if ds_type == DynamicShapesType.UNBACKED: if ds_type == DynamicShapesType.UNBACKED:
if is_torch_equal_or_newer("2.10.0"): if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims: for dim in dims:
torch._dynamo.decorators.mark_unbacked( torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim] arg, dim, hint_override=arg.size()[dim]
@@ -373,7 +373,7 @@ def _support_torch_compile(
if isinstance(arg, torch.Tensor): if isinstance(arg, torch.Tensor):
# In case dims is specified with negative indexing # In case dims is specified with negative indexing
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
if is_torch_equal_or_newer("2.10.0"): if is_torch_equal_or_newer("2.10.0.dev"):
for dim in dims: for dim in dims:
torch._dynamo.decorators.mark_unbacked( torch._dynamo.decorators.mark_unbacked(
arg, dim, hint_override=arg.size()[dim] arg, dim, hint_override=arg.size()[dim]
@@ -525,9 +525,9 @@ def _support_torch_compile(
fx_config_patches["backed_size_oblivious"] = True fx_config_patches["backed_size_oblivious"] = True
# Prepare inductor config patches # Prepare inductor config patches
# assume_32bit_indexing is only available in torch 2.10.0+ # assume_32bit_indexing is only available in torch 2.10.0.dev+
inductor_config_patches = {} inductor_config_patches = {}
if is_torch_equal_or_newer("2.10.0"): if is_torch_equal_or_newer("2.10.0.dev"):
inductor_config_patches["assume_32bit_indexing"] = ( inductor_config_patches["assume_32bit_indexing"] = (
self.compilation_config.dynamic_shapes_config.assume_32_bit_indexing self.compilation_config.dynamic_shapes_config.assume_32_bit_indexing
) )

View File

@@ -181,7 +181,7 @@ class SpeculativeConfig:
@staticmethod @staticmethod
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
initial_architecture = hf_config.architectures[0] initial_architecture = hf_config.architectures[0]
if hf_config.model_type in ("deepseek_v3", "deepseek_v32"): if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
hf_config.model_type = "deepseek_mtp" hf_config.model_type = "deepseek_mtp"
if hf_config.model_type == "deepseek_mtp": if hf_config.model_type == "deepseek_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None) n_predict = getattr(hf_config, "num_nextn_predict_layers", None)

View File

@@ -48,7 +48,6 @@ class RealtimeConnection:
self.generation_task: asyncio.Task | None = None self.generation_task: asyncio.Task | None = None
self._is_connected = False self._is_connected = False
self._is_input_finished = False
self._is_model_validated = False self._is_model_validated = False
self._max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB self._max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
@@ -145,7 +144,7 @@ class RealtimeConnection:
commit_event = InputAudioBufferCommit(**event) commit_event = InputAudioBufferCommit(**event)
# final signals that the audio is finished # final signals that the audio is finished
if commit_event.final: if commit_event.final:
self._is_input_finished = True self.audio_queue.put_nowait(None)
else: else:
await self.start_generation() await self.start_generation()
else: else:
@@ -239,11 +238,6 @@ class RealtimeConnection:
# finish because websocket connection was killed # finish because websocket connection was killed
break break
if self.audio_queue.empty() and self._is_input_finished:
# finish because client signals that audio input
# is finished
break
usage = UsageInfo( usage = UsageInfo(
prompt_tokens=prompt_token_ids_len, prompt_tokens=prompt_token_ids_len,
completion_tokens=completion_tokens_len, completion_tokens=completion_tokens_len,

View File

@@ -271,7 +271,7 @@ def use_aot_compile() -> bool:
default_value = ( default_value = (
"1" "1"
if is_torch_equal_or_newer("2.11.0.dev") and not disable_compile_cache() if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
else "0" else "0"
) )

View File

@@ -974,7 +974,7 @@ def enable_batch_invariant_mode():
) )
reduced_precision_val = ( reduced_precision_val = (
(False, False) if is_torch_equal_or_newer("2.10.0") else False (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
) )
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
reduced_precision_val reduced_precision_val

View File

@@ -19,42 +19,17 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, QuantKey,
) )
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils.import_utils import has_triton_kernels from vllm.utils.import_utils import has_triton_kernels
logger = init_logger(__name__) logger = init_logger(__name__)
use_legacy_triton_kernels = False
if has_triton_kernels(): if has_triton_kernels():
try: try:
import triton_kernels.swiglu import triton_kernels.swiglu
from triton_kernels.matmul_ogs import ( from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
FnSpecs, from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
FusedActivation, from triton_kernels.tensor import Bitmatrix
GatherIndx,
RoutingData,
ScatterIndx,
matmul_ogs,
)
from triton_kernels.tensor import (
BIT,
Bitmatrix,
)
from triton_kernels.topk import topk
try:
from triton_kernels.tensor import (
SparseMatrix,
make_ragged_tensor_metadata,
)
except ImportError:
if current_platform.is_rocm():
logger.warning_once("Using legacy triton_kernels on ROCm")
use_legacy_triton_kernels = True
else:
raise
except (AttributeError, ImportError) as e: except (AttributeError, ImportError) as e:
logger.error( logger.error(
"Failed to import Triton kernels. Please make sure your triton " "Failed to import Triton kernels. Please make sure your triton "
@@ -103,68 +78,6 @@ def pack_bitmatrix(
tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows) tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows)
def legacy_routing_from_bitmatrix(
bitmatrix: "Bitmatrix",
expt_scal: torch.Tensor,
expt_indx: torch.Tensor,
n_expts_tot: int,
n_expts_act: int,
) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
"""
Replacement for the removed triton_kernels.routing.routing_from_bitmatrix.
Creates routing data from a bitmatrix representation.
"""
if use_legacy_triton_kernels:
from triton_kernels.routing import routing_from_bitmatrix
return routing_from_bitmatrix(
bitmatrix, expt_scal, expt_indx, n_expts_tot, n_expts_act
)
sparse_logits = SparseMatrix(indx=expt_indx, vals=expt_scal, mask=bitmatrix)
dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx
combine_indx = sparse_logits.mask_metadata.col_sorted_indx
ragged_batch_metadata = make_ragged_tensor_metadata(
sparse_logits.mask_metadata.col_sum,
dispatch_indx.shape[0],
)
gate_scal = sparse_logits.vals.flatten()[combine_indx]
routing_data = RoutingData(
gate_scal,
ragged_batch_metadata.block_sizes,
n_expts_tot,
n_expts_act,
ragged_batch_metadata,
)
gather_idx = GatherIndx(combine_indx, dispatch_indx)
scatter_idx = ScatterIndx(dispatch_indx, combine_indx)
return routing_data, gather_idx, scatter_idx
def legacy_routing(
logits: torch.Tensor,
n_expts_act: int,
sm_first: bool = False,
) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
"""
Replacement for the removed triton_kernels.routing.routing function.
Computes routing data from gating logits.
"""
if use_legacy_triton_kernels:
from triton_kernels.routing import routing
return routing(logits, n_expts_act, sm_first=sm_first)
if sm_first:
logits = torch.softmax(logits, dim=-1)
sparse_logits = topk(logits, n_expts_act, apply_softmax=not sm_first)
return legacy_routing_from_bitmatrix(
sparse_logits.mask,
sparse_logits.vals,
sparse_logits.indx,
logits.shape[-1],
n_expts_act,
)
def triton_kernel_moe_forward( def triton_kernel_moe_forward(
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
w1, # Tensor or triton_kernels.Tensor w1, # Tensor or triton_kernels.Tensor
@@ -178,7 +91,7 @@ def triton_kernel_moe_forward(
global_num_experts: int = -1, global_num_experts: int = -1,
expert_map: torch.Tensor | None = None, expert_map: torch.Tensor | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
routing_data, gather_idx, scatter_idx = legacy_routing( routing_data, gather_idx, scatter_idx = routing(
gating_output, topk, sm_first=not renormalize gating_output, topk, sm_first=not renormalize
) )
@@ -254,23 +167,11 @@ def triton_kernel_fused_experts(
) )
output_tensor = _resize_cache(output_tensor, (batch_dim, M, K)) output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))
act = ( act = FusedActivation(
FusedActivation(
FnSpecs(
"swiglu",
triton_kernels.swiglu.swiglu_fn,
("alpha", "limit"),
reduction_n=2,
),
(swiglu_alpha, swiglu_limit),
)
if not use_legacy_triton_kernels
else FusedActivation(
FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
(swiglu_alpha, swiglu_limit), (swiglu_alpha, swiglu_limit),
2, 2,
) )
)
gammas = routing_data.gate_scal if routing_data else None gammas = routing_data.gate_scal if routing_data else None
matmul_ogs( matmul_ogs(
@@ -330,22 +231,13 @@ def make_routing_data(
bitmatrix_shape = [n_rows, bm_cols * 32] bitmatrix_shape = [n_rows, bm_cols * 32]
bitmatrix_shape_max = [n_rows, None] bitmatrix_shape_max = [n_rows, None]
bitmatrix = ( bitmatrix = Bitmatrix(
Bitmatrix( bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max
)
if not use_legacy_triton_kernels
else Bitmatrix(
bitmatrix,
shape=bitmatrix_shape,
shape_max=bitmatrix_shape_max,
scratchpad=None,
)
) )
# matmul_ogs expects invalid topk_weights to be -1s # matmul_ogs expects invalid topk_weights to be -1s
topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights) topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
routing_data, gather_indx, scatter_indx = legacy_routing_from_bitmatrix( routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
) )

View File

@@ -836,7 +836,7 @@ class DeepseekV2MLAAttention(nn.Module):
qk_rope_head_dim, qk_rope_head_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
rope_parameters=config.rope_parameters, rope_parameters=config.rope_parameters,
is_neox_style=True, is_neox_style=not getattr(config, "indexer_rope_interleave", False),
) )
self.indexer = Indexer( self.indexer = Indexer(
vllm_config, vllm_config,
@@ -1499,6 +1499,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
pass pass
class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM):
pass
# Compatibility with # Compatibility with
# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
def get_spec_layer_idx_from_weight_name( def get_spec_layer_idx_from_weight_name(

View File

@@ -114,6 +114,7 @@ _TEXT_GENERATION_MODELS = {
"Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"),
"Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"),
"Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"), "Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"),
"GlmMoeDsaForCausalLM": ("deepseek_v2", "GlmMoeDsaForCausalLM"),
"GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"), "GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"),
"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),

View File

@@ -237,6 +237,7 @@ class ModelArchConfigConvertorBase:
"deepseek_v3", "deepseek_v3",
"deepseek_v32", "deepseek_v32",
"deepseek_mtp", "deepseek_mtp",
"glm_moe_dsa",
"glm4_moe_lite", "glm4_moe_lite",
"glm4_moe_lite_mtp", "glm4_moe_lite_mtp",
"kimi_k2", "kimi_k2",

View File

@@ -1503,6 +1503,24 @@ class SpecDecodeBaseProposer:
del self.model.lm_head del self.model.lm_head
self.model.lm_head = target_language_model.lm_head self.model.lm_head = target_language_model.lm_head
# MTP models call compute_logits via shared_head.head (a
# ParallelLMHead inside each MTP layer), not self.model.lm_head.
# If the checkpoint omits a copy of the lm_head weights at the
# MTP layer path, shared_head.head stays uninitialised and
# produces NaN logits. Always share it explicitly.
inner = getattr(self.model, "model", None)
layers = getattr(inner, "layers", None) if inner else None
if layers is not None:
items = layers.values() if isinstance(layers, nn.ModuleDict) else layers
for layer in items:
sh = getattr(layer, "shared_head", None)
if sh is not None and hasattr(sh, "head"):
del sh.head
sh.head = target_language_model.lm_head
logger.info(
"Shared target model lm_head with MTP shared_head.head."
)
@torch.inference_mode() @torch.inference_mode()
def dummy_run( def dummy_run(
self, self,