Compare commits
18 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
89a77b1084 | ||
|
|
d3c1513f5f | ||
|
|
5dbfbc967b | ||
|
|
c86cdcbcd2 | ||
|
|
3c9496f146 | ||
|
|
2d5be1dd5c | ||
|
|
7a06e5b05b | ||
|
|
946b2f106c | ||
|
|
5e8adb0c49 | ||
|
|
9be1ff2d3a | ||
|
|
b3ee90f961 | ||
|
|
c44d0c6d66 | ||
|
|
83db96d8cd | ||
|
|
dbfb79fe45 | ||
|
|
b2e1fc3589 | ||
|
|
55a1baebc5 | ||
|
|
e1e9841631 | ||
|
|
5bd63387c3 |
@@ -3,7 +3,6 @@ steps:
|
||||
- label: ":docker: Build image"
|
||||
key: image-build
|
||||
depends_on: []
|
||||
timeout_in_minutes: 600
|
||||
commands:
|
||||
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
|
||||
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
|
||||
@@ -42,7 +41,7 @@ steps:
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
|
||||
- label: ":docker: Build CPU arm64 image"
|
||||
key: cpu-arm64-image-build
|
||||
depends_on: []
|
||||
|
||||
@@ -14,7 +14,7 @@ BUILDKITE_COMMIT=$3
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
@@ -24,10 +24,10 @@ fi
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
|
||||
|
||||
@@ -248,8 +248,8 @@ steps:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- group: "Publish wheels"
|
||||
key: "publish-wheels"
|
||||
- group: "Publish release artifacts"
|
||||
key: "publish-release-artifacts"
|
||||
steps:
|
||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||
key: block-upload-release-wheels
|
||||
@@ -265,6 +265,27 @@ steps:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||
|
||||
- block: "Confirm update release images to DockerHub"
|
||||
key: block-update-release-images-dockerhub
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- annotate-release-workflow
|
||||
|
||||
- label: "Publish release images to DockerHub"
|
||||
depends_on:
|
||||
- block-update-release-images-dockerhub
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-release-images-dockerhub.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
# =============================================================================
|
||||
# ROCm Release Pipeline (x86_64 only)
|
||||
|
||||
@@ -39,6 +39,7 @@ docker run \
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
|
||||
98
.buildkite/scripts/push-release-images-dockerhub.sh
Normal file
98
.buildkite/scripts/push-release-images-dockerhub.sh
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/bin/bash
#
# Publish the release images built for the current commit to Docker Hub.
#
# Steps:
#   1. Read the release version from Buildkite meta-data (leading "v" stripped).
#   2. Log in to the public ECR registry and pull every per-arch release image.
#   3. Re-tag each image as `latest-<variant>` and `v<version>-<variant>` under
#      the Docker Hub repositories and push both tags.
#   4. Assemble and push the multi-arch manifest lists (`latest`, `v<version>`).
#
# Inputs:
#   BUILDKITE_COMMIT               commit SHA used in the source image tags
#   release-version (meta-data)    release version, with or without a leading "v"
#
# NOTE(review): this script never logs in to Docker Hub itself; presumably the
# calling pipeline step does so (e.g. via a docker-login plugin) -- confirm.

set -ex

# `sed` strips an optional leading "v" so the version can be re-prefixed
# uniformly below. A failing `buildkite-agent` call yields an empty string,
# which is caught by the check that follows.
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
  echo "RELEASE_VERSION is not set"
  exit 1
fi

aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7

# Download images:

docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm"
docker pull "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}"
docker pull "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}"

# Tag and push images:
#
# Each image is first given a short local-only tag (never pushed), which is
# then aliased to the `latest-*` and `v<version>-*` tags that are pushed.

## CUDA

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64"
docker push vllm/vllm-openai:latest-x86_64
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130"
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64"
docker push vllm/vllm-openai:latest-aarch64
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130"
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130"

## ROCm

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}"
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}" vllm/vllm-openai-rocm:latest
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}" "vllm/vllm-openai-rocm:v${RELEASE_VERSION}"
docker push vllm/vllm-openai-rocm:latest
docker push "vllm/vllm-openai-rocm:v${RELEASE_VERSION}"

docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base"
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base" vllm/vllm-openai-rocm:latest-base
docker tag "vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base" "vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base"
docker push vllm/vllm-openai-rocm:latest-base
docker push "vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base"

## CPU

docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}" vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64"
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64"

docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}" vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64"
docker push vllm/vllm-openai-cpu:latest-arm64
docker push "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64"

# Create multi-arch manifest:
#
# `docker manifest rm` only clears a manifest list cached locally by an earlier
# run. When none exists the command exits non-zero, so it must be best-effort
# (`|| true`); otherwise `set -e` aborts the script after the per-arch images
# were pushed but before any multi-arch manifest is published.

docker manifest rm vllm/vllm-openai:latest || true
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create "vllm/vllm-openai:v${RELEASE_VERSION}" "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64" "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64"
docker manifest push vllm/vllm-openai:latest
docker manifest push "vllm/vllm-openai:v${RELEASE_VERSION}"

docker manifest rm vllm/vllm-openai:latest-cu130 || true
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create "vllm/vllm-openai:v${RELEASE_VERSION}-cu130" "vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130" "vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130"
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push "vllm/vllm-openai:v${RELEASE_VERSION}-cu130"

docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create "vllm/vllm-openai-cpu:v${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64" "vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64"
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push "vllm/vllm-openai-cpu:v${RELEASE_VERSION}"
|
||||
@@ -56,8 +56,8 @@ endif()
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@@ -433,7 +433,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||
@@ -445,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||
@@ -1042,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_MOE_SM75_ARCHS)
|
||||
if (MARLIN_MOE_SM75_ARCHS)
|
||||
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_SM75_SRC}"
|
||||
|
||||
@@ -686,6 +686,7 @@ def get_model_params(config):
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"GlmMoeDsaForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
"Glm4MoeLiteForCausalLM",
|
||||
"NemotronHForCausalLM",
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
|
||||
|
||||
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
|
||||
set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
|
||||
|
||||
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
|
||||
# be directly set to the triton_kernels python directory.
|
||||
# be directly set to the triton_kernels python directory.
|
||||
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
|
||||
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
|
||||
FetchContent_Declare(
|
||||
@@ -24,7 +24,7 @@ else()
|
||||
)
|
||||
endif()
|
||||
|
||||
# Fetch content
|
||||
# Fetch content
|
||||
FetchContent_MakeAvailable(triton_kernels)
|
||||
|
||||
if (NOT triton_kernels_SOURCE_DIR)
|
||||
@@ -47,7 +47,7 @@ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/tr
|
||||
## Copy .py files to install directory.
|
||||
install(DIRECTORY
|
||||
${TRITON_KERNELS_PYTHON_DIR}
|
||||
DESTINATION
|
||||
DESTINATION
|
||||
vllm/third_party/triton_kernels/
|
||||
COMPONENT triton_kernels
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
|
||||
@@ -134,7 +134,6 @@ WORKDIR /vllm-workspace
|
||||
# Copy test requirements
|
||||
COPY requirements/test.in requirements/cpu-test.in
|
||||
|
||||
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
|
||||
RUN \
|
||||
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
||||
remove_packages_not_supported_on_aarch64() { \
|
||||
|
||||
@@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform.
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
|
||||
- OneAPI requirements: oneAPI 2025.1
|
||||
- OneAPI requirements: oneAPI 2025.3
|
||||
- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels): a package that provides all the necessary vLLM custom kernels when running vLLM on Intel GPU platforms.
|
||||
- Python: 3.12
|
||||
!!! warning
|
||||
The provided IPEX whl is Python3.12 specific so this version is a MUST.
|
||||
The provided vllm-xpu-kernels whl is Python3.12 specific so this version is a MUST.
|
||||
|
||||
# --8<-- [end:requirements]
|
||||
# --8<-- [start:set-up-using-python]
|
||||
@@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
|
||||
- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later.
|
||||
- Second, install Python packages for vLLM XPU backend building:
|
||||
|
||||
```bash
|
||||
@@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt
|
||||
- Then, build and install vLLM XPU backend:
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=xpu python setup.py install
|
||||
VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . -v
|
||||
```
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
|
||||
@@ -6,7 +6,7 @@ requires = [
|
||||
"packaging>=24.2",
|
||||
"setuptools>=77.0.3,<81.0.0",
|
||||
"setuptools-scm>=8.0",
|
||||
"torch == 2.10.0",
|
||||
"torch == 2.9.1",
|
||||
"wheel",
|
||||
"jinja2",
|
||||
"grpcio-tools==1.78.0",
|
||||
|
||||
@@ -4,10 +4,10 @@ ninja
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
setuptools-scm>=8
|
||||
torch==2.10.0
|
||||
torch==2.9.1
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
regex
|
||||
build
|
||||
protobuf
|
||||
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.*
|
||||
grpcio-tools==1.78.0 # Required for grpc entrypoints
|
||||
|
||||
@@ -9,7 +9,7 @@ blake3
|
||||
py-cpuinfo
|
||||
transformers >= 4.56.0, < 5
|
||||
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
||||
protobuf # Required by LlamaTokenizer, gRPC.
|
||||
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
aiohttp >= 3.13.3
|
||||
openai >= 1.99.1 # For Responses API with reasoning content
|
||||
|
||||
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for NVIDIA GPUs
|
||||
ray[cgraph]>=2.48.0
|
||||
torch==2.10.0
|
||||
torchaudio==2.10.0
|
||||
torch==2.9.1
|
||||
torchaudio==2.9.1
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer-python==0.6.3
|
||||
|
||||
@@ -43,5 +43,5 @@ tritonclient>=2.51.0
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
numpy
|
||||
runai-model-streamer[s3,gcs]==0.15.3
|
||||
fastsafetensors>=0.1.10
|
||||
fastsafetensors>=0.2.2
|
||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
--extra-index-url https://download.pytorch.org/whl/test/rocm7.0
|
||||
torch==2.10.0
|
||||
torchvision==0.25.0
|
||||
torchaudio==2.10.0
|
||||
triton==3.6.0
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.4
|
||||
torch==2.9.1
|
||||
torchvision==0.24.1
|
||||
torchaudio==2.9.1
|
||||
|
||||
triton==3.5.1
|
||||
cmake>=3.26.1,<4
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
# The version of gRPC libraries should be consistent with each other
|
||||
grpcio==1.78.0
|
||||
grpcio-reflection==1.78.0
|
||||
grpcio-tools==1.78.0
|
||||
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for AMD GPUs
|
||||
@@ -14,5 +19,4 @@ setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
runai-model-streamer[s3,gcs]==0.15.3
|
||||
conch-triton-kernels==1.2.1
|
||||
timm>=1.0.17
|
||||
grpcio-tools==1.78.0 # Should match `build.txt`
|
||||
timm>=1.0.17
|
||||
@@ -24,10 +24,10 @@ sentence-transformers>=5.2.0 # required for embedding tests
|
||||
soundfile # required for audio tests
|
||||
jiwer # required for audio tests
|
||||
tblib # for pickling test exceptions
|
||||
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
||||
torch==2.10.0
|
||||
torchaudio==2.10.0
|
||||
torchvision==0.25.0
|
||||
timm==1.0.17 # required for internvl and gemma3n-mm test
|
||||
torch==2.9.1
|
||||
torchaudio==2.9.1
|
||||
torchvision==0.24.1
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[image,audio] >= 1.9.0 # required for voxtral test
|
||||
@@ -48,12 +48,16 @@ buildkite-test-collector==0.1.9
|
||||
genai_perf>=0.0.8
|
||||
tritonclient>=2.51.0
|
||||
|
||||
grpcio-tools==1.78.0 # Should match `build.txt`
|
||||
# The version of gRPC libraries should be consistent with each other
|
||||
grpcio==1.78.0
|
||||
grpcio-reflection==1.78.0
|
||||
grpcio-tools==1.78.0
|
||||
|
||||
arctic-inference == 0.1.1 # Required for suffix decoding test
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
numpy
|
||||
runai-model-streamer[s3,gcs]==0.15.3
|
||||
fastsafetensors>=0.1.10
|
||||
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
|
||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||
decord==0.6.0
|
||||
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
|
||||
|
||||
@@ -155,10 +155,6 @@ coverage==7.10.6
|
||||
# via pytest-cov
|
||||
cramjam==2.9.0
|
||||
# via fastparquet
|
||||
cuda-bindings==12.9.4
|
||||
# via torch
|
||||
cuda-pathfinder==1.3.3
|
||||
# via cuda-bindings
|
||||
cupy-cuda12x==13.6.0
|
||||
# via ray
|
||||
cycler==0.12.1
|
||||
@@ -224,7 +220,7 @@ fastparquet==2024.11.0
|
||||
# via genai-perf
|
||||
fastrlock==0.8.2
|
||||
# via cupy-cuda12x
|
||||
fastsafetensors==0.1.10
|
||||
fastsafetensors==0.2.2
|
||||
# via -r requirements/test.in
|
||||
filelock==3.16.1
|
||||
# via
|
||||
@@ -309,8 +305,13 @@ greenlet==3.2.3
|
||||
# via sqlalchemy
|
||||
grpcio==1.78.0
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# grpcio-reflection
|
||||
# grpcio-tools
|
||||
# ray
|
||||
# tensorboard
|
||||
grpcio-reflection==1.78.0
|
||||
# via -r requirements/test.in
|
||||
grpcio-tools==1.78.0
|
||||
# via -r requirements/test.in
|
||||
gunicorn==23.0.0
|
||||
@@ -635,7 +636,7 @@ nvidia-nvjitlink-cu12==12.9.86
|
||||
# nvidia-cusolver-cu12
|
||||
# nvidia-cusparse-cu12
|
||||
# torch
|
||||
nvidia-nvshmem-cu12==3.4.5
|
||||
nvidia-nvshmem-cu12==3.3.20
|
||||
# via torch
|
||||
nvidia-nvtx-cu12==12.9.79
|
||||
# via torch
|
||||
@@ -785,6 +786,7 @@ protobuf==6.33.2
|
||||
# via
|
||||
# google-api-core
|
||||
# googleapis-common-protos
|
||||
# grpcio-reflection
|
||||
# grpcio-tools
|
||||
# mlflow-skinny
|
||||
# opentelemetry-proto
|
||||
@@ -1167,14 +1169,13 @@ tomli==2.2.1
|
||||
# via schemathesis
|
||||
tomli-w==1.2.0
|
||||
# via schemathesis
|
||||
torch==2.10.0+cu129
|
||||
torch==2.9.1+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# accelerate
|
||||
# bitsandbytes
|
||||
# efficientnet-pytorch
|
||||
# encodec
|
||||
# fastsafetensors
|
||||
# kornia
|
||||
# lightly
|
||||
# lightning
|
||||
@@ -1196,7 +1197,7 @@ torch==2.10.0+cu129
|
||||
# torchvision
|
||||
# vector-quantize-pytorch
|
||||
# vocos
|
||||
torchaudio==2.10.0+cu129
|
||||
torchaudio==2.9.1+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# encodec
|
||||
@@ -1209,7 +1210,7 @@ torchmetrics==1.7.4
|
||||
# pytorch-lightning
|
||||
# terratorch
|
||||
# torchgeo
|
||||
torchvision==0.25.0+cu129
|
||||
torchvision==0.24.1+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# lightly
|
||||
@@ -1251,7 +1252,7 @@ transformers==4.57.5
|
||||
# transformers-stream-generator
|
||||
transformers-stream-generator==0.0.5
|
||||
# via -r requirements/test.in
|
||||
triton==3.6.0
|
||||
triton==3.5.1
|
||||
# via torch
|
||||
tritonclient==2.64.0
|
||||
# via -r requirements/test.in
|
||||
|
||||
@@ -15,4 +15,4 @@ torch==2.10.0+xpu
|
||||
torchaudio
|
||||
torchvision
|
||||
|
||||
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.1/vllm_xpu_kernels-0.1.1-cp312-cp312-linux_x86_64.whl
|
||||
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl
|
||||
|
||||
2
setup.py
2
setup.py
@@ -1035,7 +1035,7 @@ setup(
|
||||
extras_require={
|
||||
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
|
||||
"tensorizer": ["tensorizer==2.10.1"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
|
||||
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
|
||||
"audio": [
|
||||
"librosa",
|
||||
|
||||
@@ -267,7 +267,7 @@ elif current_platform.is_rocm():
|
||||
PATTERN_TEST_MODELS_FP8 = [
|
||||
("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel)
|
||||
]
|
||||
BACKENDS = [
|
||||
BACKENDS_FP8 = [
|
||||
AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
|
||||
AttentionBackendEnum.ROCM_ATTN,
|
||||
AttentionBackendEnum.TRITON_ATTN,
|
||||
@@ -474,6 +474,17 @@ def test_attention_quant_pattern(
|
||||
assert attn_nodes_pre[0].kwargs.get("output_block_scale") is None, (
|
||||
"Attention should not have output_block_scale before fusion"
|
||||
)
|
||||
|
||||
kv_cache_dummy_dep_pre_is_none = (
|
||||
attn_nodes_pre[0].kwargs.get("kv_cache_dummy_dep") is None
|
||||
)
|
||||
kv_cache_dummy_dep_post_is_none = (
|
||||
attn_nodes_post[0].kwargs.get("kv_cache_dummy_dep") is None
|
||||
)
|
||||
assert not (kv_cache_dummy_dep_pre_is_none ^ kv_cache_dummy_dep_post_is_none), (
|
||||
"The kv_cache_dummy_dep should be consistent before and after fusion"
|
||||
)
|
||||
|
||||
if quant_key.dtype == FP8_DTYPE:
|
||||
assert attn_nodes_post[0].kwargs.get("output_block_scale") is None, (
|
||||
"Attention should not have output_block_scale after FP8 fusion"
|
||||
|
||||
@@ -90,7 +90,9 @@ def use_vllm_config(vllm_config: VllmConfig):
|
||||
yield
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
vllm_config = make_vllm_config()
|
||||
@@ -114,7 +116,9 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
|
||||
assert torch.allclose(actual, expected)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
|
||||
args = (torch.randn(10, 10),)
|
||||
@@ -128,7 +132,9 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
|
||||
CompiledMod(vllm_config=vllm_config)(*args)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
args = (torch.randn(10, 10),)
|
||||
@@ -156,7 +162,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
|
||||
assert torch.allclose(ret, expected)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Test that cache loading correctly handles the returns_tuple logic.
|
||||
@@ -215,7 +223,9 @@ def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_cache_load_returns_tuple_consistency_tuple_output(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
@@ -284,7 +294,9 @@ def test_cache_load_returns_tuple_consistency_tuple_output(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Test that the shape environment is correctly serialized and preserved
|
||||
@@ -321,7 +333,9 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
||||
assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_partition_wrapper_applied_on_aot_load(
|
||||
monkeypatch: pytest.MonkeyPatch, vllm_tmp_cache: Path, mocker
|
||||
):
|
||||
@@ -412,7 +426,9 @@ def test_partition_wrapper_applied_on_aot_load(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
@@ -476,7 +492,9 @@ def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
|
||||
symbolic_shapes_module.make_symbol = original_make_symbol
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
class TestStandaloneCompiledArtifacts:
|
||||
def test_init(self):
|
||||
cache = StandaloneCompiledArtifacts()
|
||||
@@ -650,7 +668,9 @@ class TestStandaloneCompiledArtifacts:
|
||||
assert len(restored_cache.loaded_submodule_store) == 0
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
class TestStandaloneCompiledArtifactsIntegration:
|
||||
def test_add_pickle_unpickle(self):
|
||||
cache = StandaloneCompiledArtifacts()
|
||||
|
||||
@@ -39,7 +39,9 @@ def get_test_models():
|
||||
@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
@pytest.mark.parametrize("evaluate_guards", [False, True])
|
||||
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_dynamic_shapes_compilation(
|
||||
monkeypatch,
|
||||
model_name,
|
||||
|
||||
@@ -129,5 +129,5 @@ async def test_multi_chunk_streaming(
|
||||
" First words I spoke in the original phonograph."
|
||||
" A little piece of practical poetry. Mary had a little lamb,"
|
||||
" it sleeps with quite a flow, and everywhere that Mary went,"
|
||||
" the lamb was sure to go"
|
||||
" the lamb was sure to go."
|
||||
)
|
||||
|
||||
@@ -14,7 +14,6 @@ import torch.nn as nn
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
|
||||
class SimpleLinear(nn.Module):
|
||||
@@ -61,10 +60,6 @@ def setup_cuda():
|
||||
@pytest.mark.parametrize("num_tokens", [1, 32])
|
||||
@pytest.mark.parametrize("hidden_size,latent_size", [(256, 128), (128, 64)])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.skipif(
|
||||
is_torch_equal_or_newer("2.10.0"),
|
||||
reason="Test fails with PyTorch 2.10.0 see: https://github.com/vllm-project/vllm/issues/33995",
|
||||
)
|
||||
def test_routed_input_transform_inside_vs_outside(
|
||||
num_tokens: int,
|
||||
hidden_size: int,
|
||||
|
||||
@@ -275,6 +275,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"zai-org/GLM-4.7-Flash",
|
||||
min_transformers_version="5.0.0",
|
||||
),
|
||||
"GlmMoeDsaForCausalLM": _HfExamplesInfo(
|
||||
"zai-org/GLM-5", min_transformers_version="5.0.1", is_available_online=False
|
||||
),
|
||||
"GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
|
||||
"GPTBigCodeForCausalLM": _HfExamplesInfo(
|
||||
"bigcode/starcoder",
|
||||
|
||||
@@ -97,7 +97,7 @@ def can_initialize(
|
||||
"pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
|
||||
)
|
||||
|
||||
if model_arch == "DeepseekV32ForCausalLM":
|
||||
if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
capability = current_platform.get_device_capability()
|
||||
|
||||
@@ -17,7 +17,7 @@ DTYPE = ["bfloat16"]
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", DTYPE)
|
||||
def test_ipex_quant(vllm_runner, model, dtype):
|
||||
def test_cpu_quant(vllm_runner, model, dtype):
|
||||
with vllm_runner(model, dtype=dtype) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
||||
assert output
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test model set-up and inference for quantized HF models supported
|
||||
on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
|
||||
|
||||
Validating the configuration and printing results for manual checking.
|
||||
|
||||
Run `pytest tests/quantization/test_ipex_quant.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODELS = [
|
||||
"AMead10/Llama-3.2-1B-Instruct-AWQ",
|
||||
"shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx
|
||||
]
|
||||
DTYPE = ["bfloat16"]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cpu() and not current_platform.is_xpu(),
|
||||
reason="only supports Intel CPU/XPU backend.",
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", DTYPE)
|
||||
def test_ipex_quant(vllm_runner, model, dtype):
|
||||
with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||
assert output
|
||||
print(output)
|
||||
@@ -236,7 +236,7 @@ def test_prefix_caching_for_multi_turn():
|
||||
req._all_token_ids = req.prompt_token_ids.copy()
|
||||
req.all_token_ids = ConstantList(req._all_token_ids)
|
||||
req.block_hashes = []
|
||||
req.block_hashes = req.get_hash_new_full_blocks()
|
||||
req.update_block_hashes()
|
||||
|
||||
# Schedule the next-turn requests.
|
||||
for req in next_turn_requests:
|
||||
|
||||
@@ -7,7 +7,8 @@
|
||||
set -e
|
||||
|
||||
TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
|
||||
TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-main}"
|
||||
# Pin to a specific release for reproducibility; update as needed.
|
||||
TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}"
|
||||
|
||||
echo "=== TorchCodec Installation Script ==="
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"):
|
||||
return torch.empty((M, N), dtype=input.dtype, device=input.device)
|
||||
|
||||
|
||||
class ipex_ops:
|
||||
class xpu_ops:
|
||||
@staticmethod
|
||||
def flash_attn_varlen_func(
|
||||
q: torch.Tensor,
|
||||
@@ -73,7 +73,7 @@ class ipex_ops:
|
||||
cu_seqlens_k: torch.Tensor | None = None,
|
||||
# passed in qwen vl
|
||||
dropout_p: float = 0.0,
|
||||
# The following parameters are not used in ipex kernel currently,
|
||||
# The following parameters are not used in xpu kernel currently,
|
||||
# we keep API compatible to CUDA's.
|
||||
scheduler_metadata=None,
|
||||
fa_version: int = 2,
|
||||
@@ -153,6 +153,6 @@ class ipex_ops:
|
||||
sm_margin=0, # Can be tuned if some SMs are used for communication
|
||||
) -> None:
|
||||
logger.warning_once(
|
||||
"get_scheduler_metadata is not implemented for ipex_ops, returning None."
|
||||
"get_scheduler_metadata is not implemented for xpu_ops, returning None."
|
||||
)
|
||||
return None
|
||||
@@ -233,7 +233,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
|
||||
|
||||
from torch._inductor import standalone_compile
|
||||
|
||||
supports_aot = is_torch_equal_or_newer("2.10.0")
|
||||
supports_aot = is_torch_equal_or_newer("2.10.0.dev")
|
||||
|
||||
if not supports_aot and envs.VLLM_USE_MEGA_AOT_ARTIFACT:
|
||||
logger.error(
|
||||
|
||||
@@ -333,7 +333,7 @@ def _support_torch_compile(
|
||||
) -> None:
|
||||
def mark_dynamic(arg: torch.Tensor, dims: list[int]) -> None:
|
||||
if ds_type == DynamicShapesType.UNBACKED:
|
||||
if is_torch_equal_or_newer("2.10.0"):
|
||||
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||
for dim in dims:
|
||||
torch._dynamo.decorators.mark_unbacked(
|
||||
arg, dim, hint_override=arg.size()[dim]
|
||||
@@ -373,7 +373,7 @@ def _support_torch_compile(
|
||||
if isinstance(arg, torch.Tensor):
|
||||
# In case dims is specified with negative indexing
|
||||
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
|
||||
if is_torch_equal_or_newer("2.10.0"):
|
||||
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||
for dim in dims:
|
||||
torch._dynamo.decorators.mark_unbacked(
|
||||
arg, dim, hint_override=arg.size()[dim]
|
||||
@@ -525,9 +525,9 @@ def _support_torch_compile(
|
||||
fx_config_patches["backed_size_oblivious"] = True
|
||||
|
||||
# Prepare inductor config patches
|
||||
# assume_32bit_indexing is only available in torch 2.10.0+
|
||||
# assume_32bit_indexing is only available in torch 2.10.0.dev+
|
||||
inductor_config_patches = {}
|
||||
if is_torch_equal_or_newer("2.10.0"):
|
||||
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||
inductor_config_patches["assume_32bit_indexing"] = (
|
||||
self.compilation_config.dynamic_shapes_config.assume_32_bit_indexing
|
||||
)
|
||||
|
||||
@@ -142,6 +142,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
|
||||
v: torch.Tensor,
|
||||
output_attn: torch.Tensor,
|
||||
scale: torch.Tensor,
|
||||
kv_cache_dummy_dep: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
at1 = auto_functionalized(
|
||||
ATTN_OP,
|
||||
@@ -152,6 +153,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
|
||||
layer_name=self.layer_name,
|
||||
output_scale=None,
|
||||
output_block_scale=None,
|
||||
kv_cache_dummy_dep=kv_cache_dummy_dep,
|
||||
)
|
||||
attn_out_view = RESHAPE_OP(
|
||||
at1[1], [q.shape[0], self.num_heads * self.head_size]
|
||||
@@ -165,6 +167,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
|
||||
v: torch.Tensor,
|
||||
output_attn: torch.Tensor,
|
||||
scale: torch.Tensor,
|
||||
kv_cache_dummy_dep: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
# attn output in quant_dtype
|
||||
output_attn = torch.ops.aten.full.default(
|
||||
@@ -182,6 +185,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
|
||||
layer_name=self.layer_name,
|
||||
output_scale=scale,
|
||||
output_block_scale=None,
|
||||
kv_cache_dummy_dep=kv_cache_dummy_dep,
|
||||
)
|
||||
return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])
|
||||
|
||||
@@ -191,6 +195,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
|
||||
self.empty(5, self.num_heads, self.head_size), # v
|
||||
self.empty(5, self.num_heads, self.head_size), # attn_output
|
||||
empty_fp32(1, 1), # scale
|
||||
self.empty(0), # kv_cache_dummy_dep
|
||||
]
|
||||
|
||||
pm.register_replacement(
|
||||
@@ -228,6 +233,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
|
||||
output_quant: torch.Tensor,
|
||||
output_scale: torch.Tensor,
|
||||
input_scale: torch.Tensor,
|
||||
kv_cache_dummy_dep: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
at1 = auto_functionalized(
|
||||
ATTN_OP,
|
||||
@@ -238,6 +244,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
|
||||
layer_name=self.layer_name,
|
||||
output_scale=None,
|
||||
output_block_scale=None,
|
||||
kv_cache_dummy_dep=kv_cache_dummy_dep,
|
||||
)
|
||||
attn_out_view = RESHAPE_OP(
|
||||
at1[1], [q.shape[0], self.num_heads * self.head_size]
|
||||
@@ -261,6 +268,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
|
||||
output_quant: torch.Tensor,
|
||||
output_scale: torch.Tensor,
|
||||
input_scale: torch.Tensor,
|
||||
kv_cache_dummy_dep: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# attention output in quant_dtype
|
||||
output_attn = torch.ops.aten.full.default(
|
||||
@@ -280,6 +288,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
|
||||
layer_name=self.layer_name,
|
||||
output_scale=input_scale,
|
||||
output_block_scale=output_scale_view,
|
||||
kv_cache_dummy_dep=kv_cache_dummy_dep,
|
||||
)
|
||||
output = RESHAPE_OP(at2[1], [-1, self.num_heads * self.head_size // 2])
|
||||
return output, at2[2]
|
||||
@@ -294,6 +303,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
|
||||
128, round_up(self.num_heads * self.head_size // 16, 4)
|
||||
), # output_scale
|
||||
empty_fp32(1, 1), # input_scale
|
||||
self.empty(0), # kv_cache_dummy_dep
|
||||
]
|
||||
|
||||
pm.register_replacement(
|
||||
|
||||
@@ -181,7 +181,7 @@ class SpeculativeConfig:
|
||||
@staticmethod
|
||||
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
|
||||
initial_architecture = hf_config.architectures[0]
|
||||
if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
|
||||
if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
|
||||
hf_config.model_type = "deepseek_mtp"
|
||||
if hf_config.model_type == "deepseek_mtp":
|
||||
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
|
||||
|
||||
@@ -48,7 +48,6 @@ class RealtimeConnection:
|
||||
self.generation_task: asyncio.Task | None = None
|
||||
|
||||
self._is_connected = False
|
||||
self._is_input_finished = False
|
||||
self._is_model_validated = False
|
||||
|
||||
self._max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
|
||||
@@ -145,7 +144,7 @@ class RealtimeConnection:
|
||||
commit_event = InputAudioBufferCommit(**event)
|
||||
# final signals that the audio is finished
|
||||
if commit_event.final:
|
||||
self._is_input_finished = True
|
||||
self.audio_queue.put_nowait(None)
|
||||
else:
|
||||
await self.start_generation()
|
||||
else:
|
||||
@@ -239,11 +238,6 @@ class RealtimeConnection:
|
||||
# finish because websocket connection was killed
|
||||
break
|
||||
|
||||
if self.audio_queue.empty() and self._is_input_finished:
|
||||
# finish because client signals that audio input
|
||||
# is finished
|
||||
break
|
||||
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_token_ids_len,
|
||||
completion_tokens=completion_tokens_len,
|
||||
|
||||
@@ -271,7 +271,7 @@ def use_aot_compile() -> bool:
|
||||
|
||||
default_value = (
|
||||
"1"
|
||||
if is_torch_equal_or_newer("2.11.0.dev") and not disable_compile_cache()
|
||||
if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
|
||||
else "0"
|
||||
)
|
||||
|
||||
|
||||
@@ -974,7 +974,7 @@ def enable_batch_invariant_mode():
|
||||
)
|
||||
|
||||
reduced_precision_val = (
|
||||
(False, False) if is_torch_equal_or_newer("2.10.0") else False
|
||||
(False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
|
||||
)
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
|
||||
reduced_precision_val
|
||||
|
||||
@@ -102,6 +102,7 @@ if HAS_TRITON:
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
|
||||
XPUExperts,
|
||||
XPUExpertsFp8,
|
||||
)
|
||||
|
||||
__all__ += [
|
||||
@@ -121,6 +122,7 @@ if HAS_TRITON:
|
||||
"BatchedDeepGemmExperts",
|
||||
"TritonOrDeepGemmExperts",
|
||||
"XPUExperts",
|
||||
"XPUExpertsFp8",
|
||||
]
|
||||
else:
|
||||
# Some model classes directly use the custom ops. Add placeholders
|
||||
|
||||
@@ -27,21 +27,9 @@ logger = init_logger(__name__)
|
||||
if has_triton_kernels():
|
||||
try:
|
||||
import triton_kernels.swiglu
|
||||
from triton_kernels.matmul_ogs import (
|
||||
FnSpecs,
|
||||
FusedActivation,
|
||||
GatherIndx,
|
||||
RoutingData,
|
||||
ScatterIndx,
|
||||
matmul_ogs,
|
||||
)
|
||||
from triton_kernels.tensor import (
|
||||
BIT,
|
||||
Bitmatrix,
|
||||
SparseMatrix,
|
||||
make_ragged_tensor_metadata,
|
||||
)
|
||||
from triton_kernels.topk import topk
|
||||
from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
|
||||
from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
|
||||
from triton_kernels.tensor import Bitmatrix
|
||||
except (AttributeError, ImportError) as e:
|
||||
logger.error(
|
||||
"Failed to import Triton kernels. Please make sure your triton "
|
||||
@@ -90,58 +78,6 @@ def pack_bitmatrix(
|
||||
tl.store(bitmatrix_ptrs, y, mask=offsets_m[:, None] < n_rows)
|
||||
|
||||
|
||||
def legacy_routing_from_bitmatrix(
|
||||
bitmatrix: "Bitmatrix",
|
||||
expt_scal: torch.Tensor,
|
||||
expt_indx: torch.Tensor,
|
||||
n_expts_tot: int,
|
||||
n_expts_act: int,
|
||||
) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
|
||||
"""
|
||||
Replacement for the removed triton_kernels.routing.routing_from_bitmatrix.
|
||||
Creates routing data from a bitmatrix representation.
|
||||
"""
|
||||
sparse_logits = SparseMatrix(indx=expt_indx, vals=expt_scal, mask=bitmatrix)
|
||||
dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx
|
||||
combine_indx = sparse_logits.mask_metadata.col_sorted_indx
|
||||
ragged_batch_metadata = make_ragged_tensor_metadata(
|
||||
sparse_logits.mask_metadata.col_sum,
|
||||
dispatch_indx.shape[0],
|
||||
)
|
||||
gate_scal = sparse_logits.vals.flatten()[combine_indx]
|
||||
routing_data = RoutingData(
|
||||
gate_scal,
|
||||
ragged_batch_metadata.block_sizes,
|
||||
n_expts_tot,
|
||||
n_expts_act,
|
||||
ragged_batch_metadata,
|
||||
)
|
||||
gather_idx = GatherIndx(combine_indx, dispatch_indx)
|
||||
scatter_idx = ScatterIndx(dispatch_indx, combine_indx)
|
||||
return routing_data, gather_idx, scatter_idx
|
||||
|
||||
|
||||
def legacy_routing(
|
||||
logits: torch.Tensor,
|
||||
n_expts_act: int,
|
||||
sm_first: bool = False,
|
||||
) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
|
||||
"""
|
||||
Replacement for the removed triton_kernels.routing.routing function.
|
||||
Computes routing data from gating logits.
|
||||
"""
|
||||
if sm_first:
|
||||
logits = torch.softmax(logits, dim=-1)
|
||||
sparse_logits = topk(logits, n_expts_act, apply_softmax=not sm_first)
|
||||
return legacy_routing_from_bitmatrix(
|
||||
sparse_logits.mask,
|
||||
sparse_logits.vals,
|
||||
sparse_logits.indx,
|
||||
logits.shape[-1],
|
||||
n_expts_act,
|
||||
)
|
||||
|
||||
|
||||
def triton_kernel_moe_forward(
|
||||
hidden_states: torch.Tensor,
|
||||
w1, # Tensor or triton_kernels.Tensor
|
||||
@@ -155,7 +91,7 @@ def triton_kernel_moe_forward(
|
||||
global_num_experts: int = -1,
|
||||
expert_map: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
routing_data, gather_idx, scatter_idx = legacy_routing(
|
||||
routing_data, gather_idx, scatter_idx = routing(
|
||||
gating_output, topk, sm_first=not renormalize
|
||||
)
|
||||
|
||||
@@ -232,10 +168,9 @@ def triton_kernel_fused_experts(
|
||||
output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))
|
||||
|
||||
act = FusedActivation(
|
||||
FnSpecs(
|
||||
"swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2
|
||||
),
|
||||
FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
|
||||
(swiglu_alpha, swiglu_limit),
|
||||
2,
|
||||
)
|
||||
gammas = routing_data.gate_scal if routing_data else None
|
||||
|
||||
@@ -297,12 +232,12 @@ def make_routing_data(
|
||||
bitmatrix_shape = [n_rows, bm_cols * 32]
|
||||
bitmatrix_shape_max = [n_rows, None]
|
||||
bitmatrix = Bitmatrix(
|
||||
bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max
|
||||
bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
|
||||
)
|
||||
|
||||
# matmul_ogs expects invalid topk_weights to be -1s
|
||||
topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
|
||||
routing_data, gather_indx, scatter_indx = legacy_routing_from_bitmatrix(
|
||||
routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
|
||||
bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
|
||||
)
|
||||
|
||||
|
||||
@@ -52,6 +52,7 @@ class Fp8MoeBackend(Enum):
|
||||
AITER = "AITER"
|
||||
VLLM_CUTLASS = "VLLM_CUTLASS"
|
||||
BATCHED_VLLM_CUTLASS = "BATCHED_VLLM_CUTLASS"
|
||||
XPU = "XPU"
|
||||
|
||||
|
||||
def backend_to_kernel_cls(
|
||||
@@ -123,6 +124,13 @@ def backend_to_kernel_cls(
|
||||
|
||||
return CutlassBatchedExpertsFp8
|
||||
|
||||
elif backend == Fp8MoeBackend.XPU:
|
||||
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
|
||||
XPUExpertsFp8,
|
||||
)
|
||||
|
||||
return XPUExpertsFp8
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown FP8 MoE backend: {backend.value}")
|
||||
|
||||
@@ -154,6 +162,7 @@ def select_fp8_moe_backend(
|
||||
Fp8MoeBackend.TRITON,
|
||||
Fp8MoeBackend.BATCHED_TRITON,
|
||||
Fp8MoeBackend.MARLIN,
|
||||
Fp8MoeBackend.XPU,
|
||||
]
|
||||
|
||||
# NOTE(rob): We need to peak into the P/F selection to determine
|
||||
@@ -393,6 +402,7 @@ def convert_to_fp8_moe_kernel_format(
|
||||
Fp8MoeBackend.BATCHED_TRITON,
|
||||
Fp8MoeBackend.VLLM_CUTLASS,
|
||||
Fp8MoeBackend.BATCHED_VLLM_CUTLASS,
|
||||
Fp8MoeBackend.XPU,
|
||||
]:
|
||||
raise ValueError(f"Unsupported FP8 MoE backend: {fp8_backend.value}")
|
||||
|
||||
|
||||
@@ -4,13 +4,16 @@ import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
FusedMoEQuantConfig,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
|
||||
TopKWeightAndReduceNoOP,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
QuantKey,
|
||||
kFp8DynamicTensorSym,
|
||||
kFp8StaticTensorSym,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
@@ -20,6 +23,21 @@ if current_platform.is_xpu():
|
||||
|
||||
|
||||
class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
def __init__(
|
||||
self,
|
||||
moe_config: FusedMoEConfig,
|
||||
quant_config: FusedMoEQuantConfig,
|
||||
max_num_tokens: int | None = None,
|
||||
num_dispatchers: int | None = None,
|
||||
):
|
||||
super().__init__(
|
||||
moe_config,
|
||||
quant_config,
|
||||
max_num_tokens,
|
||||
num_dispatchers,
|
||||
)
|
||||
self.is_fp8 = False
|
||||
|
||||
@property
|
||||
def expects_unquantized_inputs(self) -> bool:
|
||||
return True
|
||||
@@ -49,10 +67,10 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
weight_key: QuantKey | None,
|
||||
activation_key: QuantKey | None,
|
||||
) -> bool:
|
||||
# TODO: dispatch based on device.
|
||||
SUPPORTED_W_A = [
|
||||
(None, None),
|
||||
(kFp8StaticTensorSym, None),
|
||||
(kFp8StaticTensorSym, kFp8DynamicTensorSym),
|
||||
]
|
||||
return (weight_key, activation_key) in SUPPORTED_W_A
|
||||
|
||||
@@ -103,10 +121,10 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
xpu_fused_moe(
|
||||
hidden_states=hidden_states,
|
||||
w13=w1,
|
||||
w13_scales=a1q_scale,
|
||||
w13_scales=self.w1_scale,
|
||||
w13_bias=self.w1_bias,
|
||||
w2=w2,
|
||||
w2_scales=a2_scale,
|
||||
w2_scales=self.w2_scale,
|
||||
w2_bias=self.w2_bias,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
@@ -116,5 +134,22 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
ep_rank=self.moe_config.ep_rank,
|
||||
ep_size=self.moe_config.ep_size,
|
||||
output=output,
|
||||
is_fp8=self.is_fp8,
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
class XPUExpertsFp8(XPUExperts):
|
||||
def __init__(
|
||||
self,
|
||||
moe_config: FusedMoEConfig,
|
||||
quant_config: FusedMoEQuantConfig,
|
||||
max_num_tokens: int | None = None,
|
||||
num_dispatchers: int | None = None,
|
||||
):
|
||||
super().__init__(
|
||||
moe_config,
|
||||
quant_config,
|
||||
max_num_tokens,
|
||||
num_dispatchers,
|
||||
)
|
||||
self.is_fp8 = True
|
||||
|
||||
@@ -180,18 +180,9 @@ class Fp8Config(QuantizationConfig):
|
||||
weight_block_size=weight_block_size,
|
||||
)
|
||||
|
||||
def get_xpu_quant_method(
|
||||
self, layer: torch.nn.Module, prefix: str
|
||||
) -> "QuantizeMethodBase | None":
|
||||
raise NotImplementedError(
|
||||
"FP8 quantization is not supported during xpu kernel migration."
|
||||
)
|
||||
|
||||
def get_quant_method(
|
||||
self, layer: torch.nn.Module, prefix: str
|
||||
) -> "QuantizeMethodBase | None":
|
||||
if current_platform.is_xpu():
|
||||
return self.get_xpu_quant_method(layer, prefix)
|
||||
if isinstance(layer, LinearBase):
|
||||
if is_layer_skipped(
|
||||
prefix=prefix,
|
||||
@@ -300,7 +291,7 @@ class Fp8LinearMethod(LinearMethodBase):
|
||||
or envs.VLLM_TEST_FORCE_FP8_MARLIN
|
||||
)
|
||||
# Disable marlin for rocm
|
||||
if current_platform.is_rocm():
|
||||
if current_platform.is_rocm() or current_platform.is_xpu():
|
||||
self.use_marlin = False
|
||||
if vllm_is_batch_invariant():
|
||||
self.use_marlin = False
|
||||
|
||||
@@ -39,6 +39,9 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKer
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
|
||||
TritonInt8ScaledMMLinearKernel,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.xpu import (
|
||||
XPUFP8ScaledMMLinearKernel,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
|
||||
from vllm.platforms import PlatformEnum, current_platform
|
||||
|
||||
@@ -72,6 +75,9 @@ _POSSIBLE_FP8_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] =
|
||||
PerTensorTorchFP8ScaledMMLinearKernel,
|
||||
ChannelWiseTorchFP8ScaledMMLinearKernel,
|
||||
],
|
||||
PlatformEnum.XPU: [
|
||||
XPUFP8ScaledMMLinearKernel,
|
||||
],
|
||||
}
|
||||
|
||||
_KernelT = TypeVar("_KernelT", bound=ScaledMMLinearKernel)
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
|
||||
FP8ScaledMMLinearKernel,
|
||||
FP8ScaledMMLinearLayerConfig,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
|
||||
@classmethod
|
||||
def is_supported(
|
||||
cls, compute_capability: int | None = None
|
||||
) -> tuple[bool, str | None]:
|
||||
if not current_platform.is_xpu():
|
||||
return False, "XPUFP8ScaledMM only support on XPU"
|
||||
return True, None
|
||||
|
||||
@classmethod
|
||||
def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
|
||||
if c.weight_quant_key.dtype not in {torch.float8_e5m2, torch.float8_e4m3fn}:
|
||||
return False, "XPUFP8ScaledMM only support FP8 weight dtype"
|
||||
return True, None
|
||||
|
||||
def __init__(
|
||||
self, c: FP8ScaledMMLinearLayerConfig, layer_param_names: Sequence[str]
|
||||
) -> None:
|
||||
assert self.can_implement(c)[0]
|
||||
assert self.is_supported()[0]
|
||||
self.config = c
|
||||
self.layer_param_names = layer_param_names
|
||||
|
||||
def apply_weights(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
x: torch.Tensor,
|
||||
bias: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
weight = layer.weight
|
||||
weight_scale = layer.weight_scale
|
||||
return torch.ops._xpu_C.fp8_gemm_w8a16(x, weight, weight_scale, bias)
|
||||
|
||||
def apply_scaled_mm(
|
||||
self,
|
||||
*,
|
||||
A: torch.Tensor,
|
||||
B: torch.Tensor,
|
||||
out_dtype: torch.dtype,
|
||||
As: torch.Tensor,
|
||||
Bs: torch.Tensor,
|
||||
bias: torch.Tensor | None,
|
||||
output_shape: list,
|
||||
) -> torch.Tensor:
|
||||
pass
|
||||
@@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
|
||||
logger.info_once("Using Triton backend")
|
||||
return Mxfp4Backend.TRITON
|
||||
elif current_platform.is_xpu():
|
||||
logger.info_once("Using ipex marlin backend on XPU")
|
||||
logger.info_once("Using xpu backend on XPU")
|
||||
return Mxfp4Backend.MARLIN
|
||||
elif current_platform.is_rocm() and has_triton_kernels():
|
||||
logger.info_once("Using Triton backend")
|
||||
|
||||
@@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager
|
||||
if current_platform.is_cuda_alike():
|
||||
from vllm import _custom_ops as ops
|
||||
elif current_platform.is_xpu():
|
||||
from vllm._ipex_ops import ipex_ops as ops
|
||||
from vllm._xpu_ops import xpu_ops as ops
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -836,7 +836,7 @@ class DeepseekV2MLAAttention(nn.Module):
|
||||
qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
is_neox_style=not getattr(config, "indexer_rope_interleave", False),
|
||||
)
|
||||
self.indexer = Indexer(
|
||||
vllm_config,
|
||||
@@ -1499,6 +1499,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
|
||||
pass
|
||||
|
||||
|
||||
class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM):
|
||||
pass
|
||||
|
||||
|
||||
# Compatibility with
|
||||
# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
|
||||
def get_spec_layer_idx_from_weight_name(
|
||||
|
||||
@@ -114,6 +114,7 @@ _TEXT_GENERATION_MODELS = {
|
||||
"Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"),
|
||||
"Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"),
|
||||
"Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"),
|
||||
"GlmMoeDsaForCausalLM": ("deepseek_v2", "GlmMoeDsaForCausalLM"),
|
||||
"GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"),
|
||||
"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
|
||||
"GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
|
||||
|
||||
@@ -338,7 +338,6 @@ class CpuPlatform(Platform):
|
||||
ld_preload_str += pytorch_libgomp_so
|
||||
os.environ["LD_PRELOAD"] = ld_preload_str
|
||||
|
||||
# To hint IPEX uses shared memory based AllReduce
|
||||
os.environ["LOCAL_WORLD_SIZE"] = str(
|
||||
vllm_config.parallel_config.tensor_parallel_size
|
||||
)
|
||||
|
||||
@@ -237,6 +237,7 @@ class ModelArchConfigConvertorBase:
|
||||
"deepseek_v3",
|
||||
"deepseek_v32",
|
||||
"deepseek_mtp",
|
||||
"glm_moe_dsa",
|
||||
"glm4_moe_lite",
|
||||
"glm4_moe_lite_mtp",
|
||||
"kimi_k2",
|
||||
|
||||
@@ -23,12 +23,11 @@ if current_platform.is_cuda():
|
||||
|
||||
elif current_platform.is_xpu():
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm._xpu_ops import xpu_ops
|
||||
|
||||
reshape_and_cache_flash = ops.reshape_and_cache_flash
|
||||
from vllm._ipex_ops import ipex_ops
|
||||
|
||||
flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func # type: ignore[assignment]
|
||||
get_scheduler_metadata = ipex_ops.get_scheduler_metadata # type: ignore[assignment]
|
||||
flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func # type: ignore[assignment]
|
||||
get_scheduler_metadata = xpu_ops.get_scheduler_metadata # type: ignore[assignment]
|
||||
elif current_platform.is_rocm():
|
||||
try:
|
||||
from flash_attn import flash_attn_varlen_func # type: ignore[no-redef]
|
||||
@@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool:
|
||||
|
||||
Platform-specific sources:
|
||||
- CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func
|
||||
- XPU: ipex_ops.flash_attn_varlen_func
|
||||
- XPU: xpu_ops.flash_attn_varlen_func
|
||||
- ROCm: upstream flash_attn.flash_attn_varlen_func (if available)
|
||||
|
||||
Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py)
|
||||
|
||||
@@ -9,7 +9,7 @@ from vllm.platforms import current_platform
|
||||
if current_platform.is_cuda_alike():
|
||||
from vllm import _custom_ops as ops
|
||||
elif current_platform.is_xpu():
|
||||
from vllm._ipex_ops import ipex_ops as ops # type: ignore[no-redef]
|
||||
from vllm._xpu_ops import xpu_ops as ops # type: ignore[no-redef]
|
||||
|
||||
|
||||
class PagedAttention:
|
||||
|
||||
@@ -982,10 +982,8 @@ class Scheduler(SchedulerInterface):
|
||||
|
||||
session._all_token_ids.extend(update.prompt_token_ids or ())
|
||||
session.prompt_token_ids.extend(update.prompt_token_ids or ())
|
||||
# Update block hashes for the new tokens
|
||||
# (mirrors Request.append_output_token_ids)
|
||||
if session.get_hash_new_full_blocks is not None:
|
||||
session.block_hashes.extend(session.get_hash_new_full_blocks())
|
||||
# Update block hashes for the new tokens.
|
||||
session.update_block_hashes()
|
||||
session.num_prompt_tokens = len(session.prompt_token_ids)
|
||||
session.arrival_time = update.arrival_time
|
||||
session.sampling_params = update.sampling_params
|
||||
|
||||
@@ -6,7 +6,6 @@ import time
|
||||
from collections import deque
|
||||
from collections.abc import Callable, Mapping
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
@@ -164,10 +163,11 @@ class Request:
|
||||
self.num_external_computed_tokens = 0
|
||||
|
||||
self.block_hashes: list[BlockHash] = []
|
||||
self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
|
||||
if block_hasher is not None:
|
||||
self.get_hash_new_full_blocks = partial(block_hasher, self)
|
||||
self.block_hashes = self.get_hash_new_full_blocks()
|
||||
# Store the block hasher without binding self to avoid creating a
|
||||
# reference cycle (Request -> partial -> Request) that prevents
|
||||
# immediate garbage collection via reference counting.
|
||||
self._block_hasher: Callable[[Request], list[BlockHash]] | None = block_hasher
|
||||
self.update_block_hashes()
|
||||
|
||||
self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache()
|
||||
|
||||
@@ -212,8 +212,12 @@ class Request:
|
||||
self._output_token_ids.extend(token_ids)
|
||||
self._all_token_ids.extend(token_ids)
|
||||
|
||||
if self.get_hash_new_full_blocks is not None:
|
||||
self.block_hashes.extend(self.get_hash_new_full_blocks())
|
||||
self.update_block_hashes()
|
||||
|
||||
def update_block_hashes(self) -> None:
|
||||
"""Compute block hashes for any new full blocks and append them."""
|
||||
if self._block_hasher is not None:
|
||||
self.block_hashes.extend(self._block_hasher(self))
|
||||
|
||||
@property
|
||||
def use_structured_output(self) -> bool:
|
||||
|
||||
@@ -1503,6 +1503,24 @@ class SpecDecodeBaseProposer:
|
||||
del self.model.lm_head
|
||||
self.model.lm_head = target_language_model.lm_head
|
||||
|
||||
# MTP models call compute_logits via shared_head.head (a
|
||||
# ParallelLMHead inside each MTP layer), not self.model.lm_head.
|
||||
# If the checkpoint omits a copy of the lm_head weights at the
|
||||
# MTP layer path, shared_head.head stays uninitialised and
|
||||
# produces NaN logits. Always share it explicitly.
|
||||
inner = getattr(self.model, "model", None)
|
||||
layers = getattr(inner, "layers", None) if inner else None
|
||||
if layers is not None:
|
||||
items = layers.values() if isinstance(layers, nn.ModuleDict) else layers
|
||||
for layer in items:
|
||||
sh = getattr(layer, "shared_head", None)
|
||||
if sh is not None and hasattr(sh, "head"):
|
||||
del sh.head
|
||||
sh.head = target_language_model.lm_head
|
||||
logger.info(
|
||||
"Shared target model lm_head with MTP shared_head.head."
|
||||
)
|
||||
|
||||
@torch.inference_mode()
|
||||
def dummy_run(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user