Compare commits


1 Commit

Author SHA1 Message Date
Lucas Wilkinson
5fbbfe9a4c [BugFix] FA2 MLA Accuracy Issue (#18807)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-05-30 08:50:58 -07:00
1559 changed files with 7189 additions and 24706 deletions

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import sys

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import os

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 import pytest

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

View File

@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
 ### Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
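
The table-generation step is, at its core, a JSON-to-markdown fold. A minimal sketch of the idea in shell; the results/*.json layout and the test_name/request_throughput field names here are illustrative, not the script's actual schema:

{
  echo "| test name | request throughput (req/s) |"
  echo "|-----------|----------------------------|"
  for f in results/*.json; do
    jq -r '"| \(.test_name) | \(.request_throughput) |"' "$f"   # one table row per result file
  done
} > benchmark_results.md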

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from lmdeploy.serve.openai.api_client import APIClient

View File

@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 import json

View File

@@ -1,6 +1,5 @@
 steps:
   - label: "Build wheel - CUDA 12.8"
-    id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -12,7 +11,6 @@ steps:
       DOCKER_BUILDKIT: "1"
   - label: "Build wheel - CUDA 12.6"
-    id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -30,7 +28,6 @@ steps:
   - label: "Build wheel - CUDA 11.8"
     # depends_on: block-build-cu118-wheel
-    id: build-wheel-cuda-11-8
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -47,7 +44,6 @@ steps:
   - label: "Build release image"
     depends_on: block-release-image-build
-    id: build-release-image
     agents:
       queue: cpu_queue_postmerge
     commands:
@@ -55,18 +51,6 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-  - label: "Annotate release workflow"
-    depends_on:
-      - build-release-image
-      - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-11-8
-    id: annotate-release-workflow
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-release.sh"
   - label: "Build and publish TPU release image"
     depends_on: ~
     if: build.env("NIGHTLY") == "1"
@@ -86,10 +70,9 @@ steps:
       DOCKER_BUILDKIT: "1"
   - input: "Provide Release version here"
-    id: input-release-version
     fields:
       - text: "What is the release version?"
-        key: release-version
+        key: "release-version"
   - block: "Build CPU release image"
     key: block-cpu-release-image-build

View File

@@ -1,31 +0,0 @@
-#!/bin/bash
-set -ex
-
-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-    echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-    exit 1
-fi
-
-buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
-\`\`\`
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
-\`\`\`
-
-To download and upload the image:
-\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
-\`\`\`
-EOF
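
The empty-metadata guard above can also be expressed with buildkite-agent's built-in default value and bash's `:?` expansion; a small sketch, assuming the same release-version key:

# Fall back to an empty string instead of erroring inside the pipe, then
# fail fast with a message if nothing was set.
RELEASE_VERSION=$(buildkite-agent meta-data get "release-version" --default "" | sed 's/^v//')
: "${RELEASE_VERSION:?release-version metadata is not set}"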

View File

@@ -1,17 +0,0 @@
-#!/bin/bash
-# Usage: ./ci_clean_log.sh ci.log
-# This script strips timestamps and color codes from CI log files.
-
-# Check if argument is given
-if [ $# -lt 1 ]; then
-    echo "Usage: $0 ci.log"
-    exit 1
-fi
-
-INPUT_FILE="$1"
-
-# Strip timestamps
-sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
-
-# Strip colorization
-sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
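
The two sed substitutions can be sanity-checked on a synthetic line; a quick demo, assuming logs use the [ISO-8601Z] prefix shown above:

printf '[2025-05-30T08:50:58Z] \033[32mPASSED\033[0m tests/foo.py\n' > /tmp/ci.log
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' /tmp/ci.log  # strip timestamp
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' /tmp/ci.log                                                     # strip ANSI colors
cat /tmp/ci.log  # prints: PASSED tests/foo.py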

View File

@@ -94,10 +94,6 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 
-if [[ $commands == *"pytest -v -s lora"* ]]; then
-  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-fi
-
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \

View File

@@ -7,7 +7,6 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
   if [[ -n "$container_id" ]]; then
-    podman stop --all -t0
     podman rm -f "$container_id" || true
   fi
   podman system prune -f
@@ -38,7 +37,7 @@ function cpu_tests() {
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
   pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-  pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+  pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
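
The cleanup logic in this script follows the standard trap-on-EXIT idiom; a minimal sketch in isolation (the container name is a placeholder):

cleanup() { podman rm -f my-container 2>/dev/null || true; }
trap cleanup EXIT   # fires on normal exit, set -e failures, and most signals
cleanup             # also reap anything a previous aborted run left behind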

View File

@@ -6,67 +6,72 @@ set -ex
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
-export CMAKE_BUILD_PARALLEL_LEVEL=32
-
 # Setup cleanup
 remove_docker_container() {
   set -e;
-  docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
+  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
+ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    VLLM_USE_V1=0 pytest -s -v \
+    pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
    tests/basic_correctness/test_chunked_prefill.py"
 
   # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
+    export VLLM_CPU_KVCACHE_SPACE=10
+    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
@@ -78,7 +83,7 @@ function cpu_tests() {
    --tokenizer facebook/opt-125m"
 
   # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/lora/test_qwen2vl.py"
@@ -86,4 +91,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
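
The final line works because `export -f` publishes the shell function to the child bash that `timeout` spawns; a self-contained sketch of the idiom (function name and arguments are placeholders):

my_tests() { echo "running with: $*"; }
export -f my_tests                      # make the function visible to child shells
timeout 40m bash -c "my_tests arg1 arg2"   # exits with code 124 if the deadline is hit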

View File

@@ -2,184 +2,102 @@
 set -xu
 
-remove_docker_container() {
-  docker rm -f tpu-test || true;
-  docker rm -f vllm-tpu || true;
-}
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 
 # Set up cleanup.
-cleanup_docker() {
-  # Get Docker's root directory
-  docker_root=$(docker info -f '{{.DockerRootDir}}')
-  if [ -z "$docker_root" ]; then
-    echo "Failed to determine Docker root directory."
-    exit 1
-  fi
-  echo "Docker root directory: $docker_root"
-  # Check disk usage of the filesystem where Docker's root directory is located
-  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-  # Define the threshold
-  threshold=70
-  if [ "$disk_usage" -gt "$threshold" ]; then
-    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-    # Remove dangling images (those that are not tagged and not used by any container)
-    docker image prune -f
-    # Remove unused volumes / force the system prune for old images as well.
-    docker volume prune -f && docker system prune --force --filter "until=72h" --all
-    echo "Docker images and volumes cleanup completed."
-  else
-    echo "Disk usage is below $threshold%. No cleanup needed."
-  fi
-}
-cleanup_docker
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
 
 # For HF_TOKEN.
 source /etc/environment
 
+# Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
   -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-  vllm-tpu /bin/bash -c '
-  set -e  # Exit immediately if a command exits with a non-zero status.
-  set -u  # Treat unset variables as an error.
-
-  echo "--- Starting script inside Docker container ---"
-
-  # Create results directory
-  RESULTS_DIR=$(mktemp -d)
-  # If mktemp fails, set -e will cause the script to exit.
-  echo "Results will be stored in: $RESULTS_DIR"
-
-  # Install dependencies
-  echo "--- Installing Python dependencies ---"
-  python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
-  echo "--- Python dependencies installed ---"
-  export VLLM_USE_V1=1
-  export VLLM_XLA_CHECK_RECOMPILATION=1
-  export VLLM_XLA_CACHE_PATH=
-  echo "Using VLLM V1"
-
-  echo "--- Hardware Information ---"
-  tpu-info
-  echo "--- Starting Tests ---"
-  set +e
-  overall_script_exit_code=0
-
-  # --- Test Definitions ---
-  # If a test fails, this function will print logs and will not cause the main script to exit.
-  run_test() {
-    local test_num=$1
-    local test_name=$2
-    local test_command=$3
-    local log_file="$RESULTS_DIR/test_${test_num}.log"
-    local actual_exit_code
-
-    echo "--- TEST_$test_num: Running $test_name ---"
-
-    # Execute the test command.
-    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
-    actual_exit_code=$?
-
-    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code"  # This goes to main log
-    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file"  # Also to per-test log
-
-    if [ "$actual_exit_code" -ne 0 ]; then
-      echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
-      echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
-      if [ -f "$log_file" ]; then
-        cat "$log_file" >&2
-      else
-        echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
-      fi
-      echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
-      return "$actual_exit_code"  # Return the failure code
-    else
-      echo "TEST_$test_num ($test_name) PASSED."
-      return 0  # Return success
-    fi
-  }
-
-  # Helper function to call run_test and update the overall script exit code
-  run_and_track_test() {
-    local test_num_arg="$1"
-    local test_name_arg="$2"
-    local test_command_arg="$3"
-
-    # Run the test
-    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
-    local test_specific_exit_code=$?
-
-    # If the test failed, set the overall script exit code to 1
-    if [ "$test_specific_exit_code" -ne 0 ]; then
-      # No need for extra echo here, run_test already logged the failure.
-      overall_script_exit_code=1
-    fi
-  }
-
-  # --- Actual Test Execution ---
-  run_and_track_test 0 "test_perf.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
-  run_and_track_test 1 "test_compilation.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
-  run_and_track_test 2 "test_basic.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
-  run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
-  run_and_track_test 4 "test_quantization_accuracy.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
-  run_and_track_test 5 "examples/offline_inference/tpu.py" \
-    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
-  run_and_track_test 6 "test_tpu_model_runner.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
-  run_and_track_test 7 "test_sampler.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
-  run_and_track_test 8 "test_topk_topp_sampler.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
-  run_and_track_test 9 "test_multimodal.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
-  run_and_track_test 10 "test_pallas.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
-  run_and_track_test 11 "test_struct_output_generate.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-  run_and_track_test 12 "test_moe_pallas.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-  run_and_track_test 13 "test_lora.py" \
-    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-  run_and_track_test 14 "test_tpu_qkv_linear.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-  run_and_track_test 15 "test_spmd_model_weight_loading.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-
-  # After all tests have been attempted, exit with the overall status.
-  if [ "$overall_script_exit_code" -ne 0 ]; then
-    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
-  else
-    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
-  fi
-  exit "$overall_script_exit_code"
-'  # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
-
-# Capture the exit code of the docker run command
-DOCKER_RUN_EXIT_CODE=$?
-
-# The trap will run for cleanup.
-# Exit the main script with the Docker run command's exit code.
-if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
-  echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
-  exit "$DOCKER_RUN_EXIT_CODE"
-else
-  echo "Docker run command completed successfully."
-  exit 0
-fi
+  vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+  && python3 -m pip install pytest pytest-asyncio tpu-info \
+  && python3 -m pip install lm_eval[api]==0.4.4 \
+  && export VLLM_XLA_CACHE_PATH= \
+  && export VLLM_USE_V1=1 \
+  && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+  && echo HARDWARE \
+  && tpu-info \
+  && { \
+    echo TEST_0: Running test_perf.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
+    echo TEST_0_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_1: Running test_compilation.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
+    echo TEST_1_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_2: Running test_basic.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
+    echo TEST_2_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
+    python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
+    echo TEST_3_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_4: Running test_quantization_accuracy.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
+    echo TEST_4_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_5: Running examples/offline_inference/tpu.py; \
+    python3 /workspace/vllm/examples/offline_inference/tpu.py; \
+    echo TEST_5_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_6: Running test_tpu_model_runner.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
+    echo TEST_6_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_7: Running test_sampler.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
+    echo TEST_7_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_8: Running test_topk_topp_sampler.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
+    echo TEST_8_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_9: Running test_multimodal.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
+    echo TEST_9_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_10: Running test_pallas.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
+    echo TEST_10_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_11: Running test_struct_output_generate.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
+    echo TEST_11_EXIT_CODE: \$?; \
+  } & \
+  { \
+    echo TEST_12: Running test_moe_pallas.py; \
+    python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
+    echo TEST_12_EXIT_CODE: \$?; \
+  } & \
+  # Disable the TPU LoRA tests until the feature is activated
+  # & { \
+  #   echo TEST_13: Running test_moe_pallas.py; \
+  #   python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
+  #   echo TEST_13_EXIT_CODE: \$?; \
+  # } & \
+  wait \
+  && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
+"
 
 # TODO: This test fails because it uses RANDOM_SEED sampling
-# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
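
The removed run_test helper captures a per-test log without losing the real exit status; the core of that pattern, in isolation (the test command is a placeholder):

log=$(mktemp)
eval "pytest -s -v some_test.py" > >(tee -a "$log") 2> >(tee -a "$log" >&2)
code=$?   # with process substitution, $? is the eval'd command's status, not tee's
echo "exit code $code recorded in $log"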

View File

@@ -1,18 +0,0 @@
-#!/bin/bash
-# Usage: ./rerun_test.sh path/to/test.py::test_name
-
-# Check if argument is given
-if [ $# -lt 1 ]; then
-    echo "Usage: $0 path/to/test.py::test_name"
-    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
-    exit 1
-fi
-
-TEST=$1
-COUNT=1
-
-while pytest -sv "$TEST"; do
-    COUNT=$((COUNT + 1))
-    echo "RUN NUMBER ${COUNT}"
-done
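
The loop reruns a test until its first failure, which is useful for reproducing flakes; a bounded variant, for illustration, stops after a fixed number of clean passes instead of running indefinitely:

MAX_RUNS=50
COUNT=0
while [ "$COUNT" -lt "$MAX_RUNS" ] && pytest -sv "$TEST"; do
    COUNT=$((COUNT + 1))
    echo "RUN NUMBER ${COUNT}"
done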

View File

@@ -1,24 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-docker_root=$(docker info -f '{{.DockerRootDir}}')
-if [ -z "$docker_root" ]; then
-  echo "Failed to determine Docker root directory."
-  exit 1
-fi
-echo "Docker root directory: $docker_root"
-
-# Check disk usage of the filesystem where Docker's root directory is located
-disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-
-# Define the threshold
-threshold=70
-
-if [ "$disk_usage" -gt "$threshold" ]; then
-  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-  # Remove dangling images (those that are not tagged and not used by any container)
-  docker image prune -f
-  # Remove unused volumes / force the system prune for old images as well.
-  docker volume prune -f && docker system prune --force --filter "until=72h" --all
-  echo "Docker images and volumes cleanup completed."
-else
-  echo "Disk usage is below $threshold%. No cleanup needed."
-fi
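
The probe at the heart of this script is df's fifth column (Use%) with the percent sign stripped; standalone, and assuming /var/lib/docker as a typical Docker root:

usage=$(df /var/lib/docker | tail -1 | awk '{print $5}' | sed 's/%//')
[ "$usage" -gt 70 ] && echo "over threshold, prune recommended"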

View File

@@ -1,14 +0,0 @@
-# Environment config
-TEST_NAME=llama8b
-CONTAINER_NAME=vllm-tpu
-
-# vllm config
-MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=512
-MAX_NUM_BATCHED_TOKENS=512
-TENSOR_PARALLEL_SIZE=1
-MAX_MODEL_LEN=2048
-DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=8.0
-INPUT_LEN=1800
-OUTPUT_LEN=128
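
Such an env file is consumed by sourcing it (the launcher in the next file does exactly this); with `set -a`, every assignment is exported so child processes see the values. A sketch, with a hypothetical filename:

set -a                 # auto-export all assignments that follow
source llama8b.env     # hypothetical name for the config above
set +a
echo "$MODEL"          # -> meta-llama/Llama-3.1-8B-Instruct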

View File

@@ -1,102 +0,0 @@
-#!/bin/bash
-
-if [ ! -f "$1" ]; then
-  echo "Error: The env file '$1' does not exist."
-  exit 1  # Exit the script with a non-zero status to indicate an error
-fi
-
-ENV_FILE=$1
-
-# For testing on local vm, use `set -a` to export all variables
-source /etc/environment
-source $ENV_FILE
-
-remove_docker_container() {
-  docker rm -f tpu-test || true;
-  docker rm -f vllm-tpu || true;
-  docker rm -f $CONTAINER_NAME || true;
-}
-
-trap remove_docker_container EXIT
-
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# Build docker image.
-# TODO: build the image outside the script and share the image with other
-# tpu test if building time is too long.
-DOCKER_BUILDKIT=1 docker build \
-  --build-arg max_jobs=16 \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg GIT_REPO_CHECK=0 \
-  --tag vllm/vllm-tpu-bm \
-  --progress plain -f docker/Dockerfile.tpu .
-
-LOG_ROOT=$(mktemp -d)
-# If mktemp fails, set -e will cause the script to exit.
-echo "Results will be stored in: $LOG_ROOT"
-
-if [ -z "$HF_TOKEN" ]; then
-  echo "Error: HF_TOKEN is not set or is empty."
-  exit 1
-fi
-
-# Make sure mounted disk or dir exists
-if [ ! -d "$DOWNLOAD_DIR" ]; then
-  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If no mounted drive, just create a folder."
-  exit 1
-fi
-
-echo "Run model $MODEL"
-echo
-
-echo "starting docker...$CONTAINER_NAME"
-echo
-docker run \
-  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
-  --env-file $ENV_FILE \
-  -e HF_TOKEN="$HF_TOKEN" \
-  -e TARGET_COMMIT=$BUILDKITE_COMMIT \
-  -e MODEL=$MODEL \
-  -e WORKSPACE=/workspace \
-  --name $CONTAINER_NAME \
-  -d \
-  --privileged \
-  --network host \
-  -v /dev/shm:/dev/shm \
-  vllm/vllm-tpu-bm tail -f /dev/null
-
-echo "run script..."
-echo
-docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
-
-echo "copy result back..."
-VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
-BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
-docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
-docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
-
-throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
-echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
-
-if [ "$BUILDKITE" = "true" ]; then
-  echo "Running inside Buildkite"
-  buildkite-agent artifact upload "$VLLM_LOG"
-  buildkite-agent artifact upload "$BM_LOG"
-else
-  echo "Not running inside Buildkite"
-fi
-
-#
-# compare the throughput with EXPECTED_THROUGHPUT
-# and assert meeting the expectation
-#
-if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
-  echo "Failed to get the throughput"
-  exit 1
-fi
-
-if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
-  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
-  exit 1
-fi
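
Bash arithmetic is integer-only, hence the detour through `bc -l` for the throughput check; the guard pattern in isolation, with sample numbers:

throughput="7.9"; expected="8.0"
[[ "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]] || { echo "not a number"; exit 1; }
if (( $(echo "$throughput < $expected" | bc -l) )); then
    echo "throughput($throughput) is below expected($expected)"
fi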

View File

@@ -1,94 +0,0 @@
-#!/bin/bash
-
-set -euo pipefail
-
-VLLM_LOG="$WORKSPACE/vllm_log.txt"
-BM_LOG="$WORKSPACE/bm_log.txt"
-
-if [ -n "$TARGET_COMMIT" ]; then
-  head_hash=$(git rev-parse HEAD)
-  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
-    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
-    exit 1
-  fi
-fi
-
-echo "model: $MODEL"
-echo
-
-#
-# create a log folder
-#
-mkdir "$WORKSPACE/log"
-
-# TODO: Move to image building.
-pip install pandas
-pip install datasets
-
-#
-# create sonnet_4x
-#
-echo "Create sonnet_4x.txt"
-echo "" > benchmarks/sonnet_4x.txt
-for _ in {1..4}
-do
-  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
-done
-
-#
-# start vllm service in backend
-#
-echo "launching vllm..."
-echo "logging to $VLLM_LOG"
-echo
-
-VLLM_USE_V1=1 vllm serve $MODEL \
-  --seed 42 \
-  --disable-log-requests \
-  --max-num-seqs $MAX_NUM_SEQS \
-  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
-  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
-  --no-enable-prefix-caching \
-  --download_dir $DOWNLOAD_DIR \
-  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
-
-echo "wait for 20 minutes.."
-echo
-# sleep 1200
-# wait for 10 minutes...
-for i in {1..120}; do
-  # TODO: detect other type of errors.
-  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
-    echo "Detected RuntimeError, exiting."
-    exit 1
-  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
-    echo "Application started"
-    break
-  else
-    echo "wait for 10 seconds..."
-    sleep 10
-  fi
-done
-
-#
-# run test
-#
-echo "run benchmark test..."
-echo "logging to $BM_LOG"
-echo
-python benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model $MODEL \
-  --dataset-name sonnet \
-  --dataset-path benchmarks/sonnet_4x.txt \
-  --sonnet-input-len $INPUT_LEN \
-  --sonnet-output-len $OUTPUT_LEN \
-  --ignore-eos > "$BM_LOG"
-echo "completed..."
-echo
-
-throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
-echo "throughput: $throughput"
-echo
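
The startup loop above polls the server log; an equivalent probe can poll the HTTP endpoint directly, assuming the default port 8000 and vLLM's /health route:

timeout 1200 bash -c 'until curl -sf localhost:8000/health; do sleep 10; done' \
    || { echo "server did not become ready in 20 minutes"; exit 1; }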

View File

@@ -145,7 +145,6 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
-  - tests/v1/engine/test_engine_core_client.py
   commands:
     # test with tp=2 and external_dp=2
     - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -155,7 +154,6 @@ steps:
     # test with internal dp
     - python3 ../examples/offline_inference/data_parallel.py
     - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-    - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
     - pytest -v -s distributed/test_utils.py
     - pytest -v -s compile/test_basic_correctness.py
     - pytest -v -s distributed/test_pynccl.py
@@ -201,9 +199,8 @@ steps:
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
-  - tests/test_vllm_port
   commands:
-    - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+    - pytest -v -s engine test_sequence.py test_config.py test_logger.py
     # OOM in the CI unless we run this separately
     - pytest -v -s tokenization
@@ -277,6 +274,17 @@ steps:
     - pytest -v -s samplers
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
+- label: LogitsProcessor Test # 5min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  source_file_dependencies:
+    - vllm/model_executor/layers
+    - vllm/model_executor/guided_decoding
+    - tests/test_logits_processor
+    - tests/model_executor/test_guided_processors
+  commands:
+    - pytest -v -s test_logits_processor.py
+    - pytest -v -s model_executor/test_guided_processors.py
+
 - label: Speculative decoding tests # 40min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
@@ -289,7 +297,7 @@ steps:
     - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
     - vllm/lora
     - tests/lora
@@ -320,7 +328,6 @@ steps:
     # these tests need to be separated, cannot combine
     - pytest -v -s compile/piecewise/test_simple.py
     - pytest -v -s compile/piecewise/test_toy_llama.py
-    - pytest -v -s compile/piecewise/test_full_cudagraph.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -390,17 +397,6 @@ steps:
     - pytest -v -s tensorizer_loader
     - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
-- label: Model Executor Test
-  mirror_hardwares: [amdexperimental, amdproduction]
-  soft_fail: true
-  source_file_dependencies:
-    - vllm/model_executor
-    - tests/model_executor
-  commands:
-    - apt-get update && apt-get install -y curl libsodium23
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s model_executor
-
 - label: Benchmarks # 9min
   mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/.buildkite"
@@ -424,9 +420,6 @@ steps:
   - vllm/model_executor/layers/quantization
   - tests/quantization
   commands:
-    # temporary install here since we need nightly, will move to requirements/test.in
-    # after torchao 0.12 release
-    - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
     - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
@@ -624,11 +617,9 @@ steps:
   - vllm/worker/model_runner.py
   - entrypoints/llm/test_collective_rpc.py
   - tests/v1/test_async_llm_dp.py
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - vllm/v1/engine/
   commands:
     - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-    - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
     - pytest -v -s entrypoints/llm/test_collective_rpc.py
     - pytest -v -s ./compile/test_basic_correctness.py
     - pytest -v -s ./compile/test_wrapper.py

.github/CODEOWNERS
View File

@@ -10,17 +10,15 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
+/vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -29,8 +27,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@@ -40,8 +38,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
-/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee

View File

@@ -1,18 +1,6 @@
-## Essential Elements of an Effective PR Description Checklist
-- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan, such as providing test command.
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results
-- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
-
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
-
-## Purpose
-
-## Test Plan
-
-## Test Result
-
-## (Optional) Documentation Update
+FILL IN THE PR DESCRIPTION HERE
+
+FIX #xxxx (*link existing issues this PR will resolve*)
 
 <!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

.github/mergify.yml
View File

@@ -36,20 +36,6 @@ pull_request_rules:
       add:
         - frontend
 
-- name: label-llama
-  description: Automatically apply llama label
-  conditions:
-    - or:
-      - files~=^examples/.*llama.*\.py
-      - files~=^tests/.*llama.*\.py
-      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
-      - files~=^vllm/model_executor/models/.*llama.*\.py
-      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
-  actions:
-    label:
-      add:
-        - llama
-
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:

View File

@@ -11,8 +11,6 @@ repos:
     hooks:
       - id: yapf
         args: [--in-place, --verbose]
-        # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-        exclude: '(.buildkite|benchmarks|build|examples)/.*'
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.11.7
     hooks:
@@ -60,7 +58,7 @@ repos:
         entry: tools/mypy.sh 0 "local"
         language: python
         types: [python]
-        additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+        additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
         stages: [pre-commit] # Don't run in CI
       - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
         name: Run mypy for Python 3.9

View File

@@ -23,9 +23,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
-# Prevent installation of dependencies (cutlass) by default.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
@@ -182,6 +179,9 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
+#
+# Set rocm version dev int.
+#
 if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose debug info
@@ -189,6 +189,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
   #
   # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
   # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@@ -242,7 +243,6 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
-  "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     #
@@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # kernels for the remaining archs that are not already built for 3x.
   # (Build 8.9 for FP8)
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
+    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -543,8 +543,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUTLASS MoE kernels
   # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
-  # if it's possible to compile MoE kernels that use its output.
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
+  # to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
     #
@@ -785,7 +785,5 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
-  # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()

View File

@@ -58,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
 - Speculative decoding
 - Chunked prefill
@@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
 - Prefix caching support
 - Multi-LoRA support
 
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
-- Embedding Models (e.g., E5-Mistral)
+- Embedding Models (e.g. E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)
 
 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Media Kit
 
-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).

View File

@@ -8,6 +8,4 @@ Please report security issues privately using [the vulnerability submission form
 ---
 
-Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
-
 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

View File

@@ -64,12 +64,6 @@ become available.
<td style="text-align: center;">✅</td> <td style="text-align: center;">✅</td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td> <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr> </tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;">✅</td>
<td style="text-align: center;">✅</td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody> </tbody>
</table> </table>
@@ -130,38 +124,6 @@ P99 ITL (ms): 8.39
================================================== ==================================================
``` ```
### Custom Dataset
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```
```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
### VisionArena Benchmark for Vision Language Models ### VisionArena Benchmark for Vision Language Models
```bash ```bash
@@ -184,9 +146,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
``` bash ``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram", --ngram_prompt_lookup_min 2 \
"num_speculative_tokens": 5, "prompt_lookup_max": 5, --ngram-prompt-lookup-max 5 \
"prompt_lookup_min": 2}' --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
``` ```
``` bash ``` bash
@@ -241,16 +203,6 @@ python3 vllm/benchmarks/benchmark_serving.py \
--seed 42 --seed 42
``` ```
**`philschmid/mt-bench`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
### Running With Sampling Parameters ### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling When using OpenAI-compatible backends such as `vllm`, optional sampling
@@ -321,9 +273,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--output-len=100 \ --output-len=100 \
--num-prompts=2048 \ --num-prompts=2048 \
--async-engine \ --async-engine \
--speculative-config $'{"method": "ngram", --ngram_prompt_lookup_min=2 \
"num_speculative_tokens": 5, "prompt_lookup_max": 5, --ngram-prompt-lookup-max=5 \
"prompt_lookup_min": 2}' --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
``` ```
``` ```

View File

@@ -10,15 +10,11 @@
# 3. Set variables (ALL REQUIRED) # 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo # BASE: your directory for vllm repo
# MODEL: the model served by vllm # MODEL: the model served by vllm
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights. # DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len # INPUT_LEN: request input len
# OUTPUT_LEN: request output len # OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate # MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 # MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium-size input/output lengths; for extra-short contexts (such as 20:20), you might need to include larger values in NUM_SEQS_LIST.
# 4. Run the script. It might take a long time; you can use tmux so the script keeps running if you get disconnected. # 4. Run the script. It might take a long time; you can use tmux so the script keeps running if you get disconnected.
# 5. The final result will be saved in the RESULT file. # 5. The final result will be saved in the RESULT file.
@@ -34,27 +30,31 @@
TAG=$(date +"%Y_%m_%d_%H_%M") TAG=$(date +"%Y_%m_%d_%H_%M")
BASE="" BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct" MODEL="meta-llama/Llama-3.1-8B-Instruct"
TP=1
DOWNLOAD_DIR="" DOWNLOAD_DIR=""
INPUT_LEN=4000 INPUT_LEN=4000
OUTPUT_LEN=16 OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0 MIN_CACHE_HIT_PCT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG" LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt" RESULT="$LOG_FOLDER/result.txt"
echo "result file: $RESULT" echo "result file$ $RESULT"
echo "model: $MODEL" echo "model: $MODEL"
echo
rm -rf $LOG_FOLDER rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER mkdir -p $LOG_FOLDER
cd "$BASE/vllm" cd "$BASE/vllm"
# create sonnet-4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
pip install -q datasets pip install datasets
current_hash=$(git rev-parse HEAD) current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT" echo "hash:$current_hash" >> "$RESULT"
@@ -64,69 +64,53 @@ best_throughput=0
best_max_num_seqs=0 best_max_num_seqs=0
best_num_batched_tokens=0 best_num_batched_tokens=0
best_goodput=0 best_goodput=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
run_benchmark() { run_benchmark() {
local max_num_seqs=$1 local max_num_seqs=$1
local max_num_batched_tokens=$2 local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f $vllm_log
pkill -f vllm
echo "starting server..." # start the server
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
result=$? --disable-log-requests \
if [[ "$result" -eq 1 ]]; then --port 8004 \
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" --gpu-memory-utilization 0.98 \
else --max-num-seqs $max_num_seqs \
echo "server started." --max-num-batched-tokens $max_num_batched_tokens \
fi --tensor-parallel-size 1 \
--enable-prefix-caching \
--load-format dummy \
--download-dir $DOWNLOAD_DIR \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo echo
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then
echo "Application started"
server_started=1
break
else
# echo "wait for 10 seconds..."
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
return 1
fi
echo "run benchmark test..." echo "run benchmark test..."
echo
meet_latency_requirement=0 meet_latency_requirement=0
# get a basic qps by using request-rate inf # get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@@ -134,29 +118,29 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name sonnet \
--random-input-len $INPUT_LEN \ --dataset-path benchmarks/sonnet_4x.txt \
--random-output-len $OUTPUT_LEN \ --sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate inf \ --request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--port 8004 &> "$bm_log" --port 8004 > "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1 meet_latency_requirement=1
request_rate=inf
fi fi
if (( ! meet_latency_requirement )); then if (( ! meet_latency_requirement )); then
# start from request-rate as int(throughput) + 1 # start from request-rate as int(through_put) + 1
request_rate=$((${throughput%.*} + 1)) request_rate=$((${through_put%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
# clear prefix cache # clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@@ -165,18 +149,19 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name sonnet \
--random-input-len $INPUT_LEN \ --dataset-path benchmarks/sonnet_4x.txt \
--random-output-len $OUTPUT_LEN \ --sonnet-input-len $INPUT_LEN \
--ignore-eos \ --sonnet-output-len $OUTPUT_LEN \
--ignore_eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $request_rate \ --request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--port 8004 &> "$bm_log" --port 8004 > "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@@ -188,10 +173,10 @@ run_benchmark() {
fi fi
# write the results and update the best result. # write the results and update the best result.
if ((meet_latency_requirement)); then if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
best_throughput=$throughput best_throughput=$through_put
best_max_num_seqs=$max_num_seqs best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput best_goodput=$goodput
@@ -203,39 +188,22 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm pkill vllm
sleep 10 sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20) printf '=%.0s' $(seq 1 20)
return 0 return 0
} }
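In short, `run_benchmark` probes an unthrottled run first, then walks the request rate down one step at a time until the P99 end-to-end latency fits the budget. A minimal Python sketch of that search, assuming a hypothetical `run_serving_benchmark` helper that invokes `benchmark_serving.py` and parses its log:

```python
# Hedged sketch of the request-rate search performed above with bash/grep/sed.
# run_serving_benchmark is a hypothetical stand-in returning
# (request throughput in req/s, P99 end-to-end latency in ms).
def find_max_request_rate(run_serving_benchmark, max_latency_ms: float):
    throughput, p99_e2el = run_serving_benchmark(request_rate=float("inf"))
    if p99_e2el <= max_latency_ms:
        return float("inf"), throughput  # even unthrottled traffic meets the budget
    rate = int(throughput) + 1  # start just above the unthrottled throughput
    while rate > 0:
        throughput, p99_e2el = run_serving_benchmark(request_rate=rate)
        if p99_e2el <= max_latency_ms:
            return rate, throughput  # highest rate meeting the latency budget
        rate -= 1
    return None, 0.0  # no request rate satisfies the requirement
```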
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
# first find out the max gpu-memory-utilization without HBM OOM. num_seqs_list="128 256"
gpu_memory_utilization=0.98 num_batched_tokens_list="512 1024 2048 4096"
find_gpu_memory_utilization=0 for num_seqs in $num_seqs_list; do
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do for num_batched_tokens in $num_batched_tokens_list; do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" run_benchmark $num_seqs $num_batched_tokens
result=$? exit 0
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done done
done done
echo "finish permutations" echo "finish permutations"

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io import io
import json import json
@@ -325,7 +324,7 @@ async def async_request_openai_completions(
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
generated_text += text or "" generated_text += text or ""
if usage := data.get("usage"): elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens") output.output_tokens = usage.get("completion_tokens")
if first_chunk_received: if first_chunk_received:
output.success = True output.success = True
@@ -612,7 +611,6 @@ ASYNC_REQUEST_FUNCS = {
"tensorrt-llm": async_request_trt_llm, "tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions, "scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions, "sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions,
} }
OPENAI_COMPATIBLE_BACKENDS = [ OPENAI_COMPATIBLE_BACKENDS = [

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This module defines a framework for sampling benchmark requests from various This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample datasets. Each dataset subclass of BenchmarkDataset must implement sample
@@ -10,6 +9,9 @@ generation. Supported dataset types include:
- BurstGPT - BurstGPT
- HuggingFace - HuggingFace
- VisionArena - VisionArena
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
SampleRequest instances, similar to the approach used in ShareGPT.
""" """
import base64 import base64
@@ -440,97 +442,6 @@ class ShareGPTDataset(BenchmarkDataset):
return samples return samples
# -----------------------------------------------------------------------------
# Custom Dataset Implementation
# -----------------------------------------------------------------------------
class CustomDataset(BenchmarkDataset):
"""
Implements the Custom dataset. Loads data from a JSONL file and generates
sample requests based on conversation turns. E.g.,
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
# self.data will be a list of dictionaries
# e.g., [{"prompt": "What is the capital of India?"}, ...]
# This will be the standardized format which load_data()
# has to convert into depending on the filetype of dataset_path.
# sample() will assume this standardized format of self.data
self.data = []
# Load the JSONL file
if self.dataset_path.endswith(".jsonl"):
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
# check if the JSONL file has a 'prompt' column
if "prompt" not in jsonl_data.columns:
raise ValueError("JSONL file must contain a 'prompt' column.")
# Convert each row to a dictionary and append to self.data
# This will convert the DataFrame to a list of dictionaries
# where each dictionary corresponds to a row in the DataFrame.
# This is the standardized format we want for self.data
for _, row in jsonl_data.iterrows():
self.data.append(row.to_dict())
else:
raise NotImplementedError(
"Only JSONL format is supported for CustomDataset."
)
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
**kwargs,
) -> list:
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item["prompt"]
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
)
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
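For reference, this is roughly how the removed `CustomDataset` was driven from `benchmark_serving.py`. A hedged usage sketch; the tokenizer, file path, and parameter values are illustrative.

```python
# Hypothetical driver for the CustomDataset class shown above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
dataset = CustomDataset(dataset_path="data.jsonl")  # one {"prompt": ...} per line
requests = dataset.sample(
    tokenizer=tokenizer,
    num_requests=80,
    output_len=256,            # mirrors the --custom-output-len default
    skip_chat_template=False,  # True mirrors --custom-skip-chat-template
)
```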
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Sonnet Dataset Implementation # Sonnet Dataset Implementation
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@@ -865,15 +776,7 @@ class InstructCoderDataset(HuggingFaceDataset):
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
prompt = f"{item['input']}\n\n{item['instruction']} Just output \ prompt = f"{item['instruction']}:\n{item['input']}"
the code, do not include any explanation."
# apply template
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests.""" """Benchmark the latency of processing a single batch of requests."""
import argparse import argparse
@@ -7,12 +6,13 @@ import dataclasses
import json import json
import os import os
import time import time
from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import numpy as np import numpy as np
import torch
from tqdm import tqdm from tqdm import tqdm
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
@@ -80,9 +80,17 @@ def main(args: argparse.Namespace):
def run_to_completion(profile_dir: Optional[str] = None): def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir: if profile_dir:
llm.start_profile() with torch.profiler.profile(
llm_generate() activities=[
llm.stop_profile() torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)
),
) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
else: else:
start_time = time.perf_counter() start_time = time.perf_counter()
llm_generate() llm_generate()
@@ -95,7 +103,11 @@ def main(args: argparse.Namespace):
run_to_completion(profile_dir=None) run_to_completion(profile_dir=None)
if args.profile: if args.profile:
profile_dir = envs.VLLM_TORCH_PROFILER_DIR profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = (
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
)
print(f"Profiling (results will be saved to '{profile_dir}')...") print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir) run_to_completion(profile_dir=profile_dir)
return return
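The older code path above wraps generation in a `torch.profiler` context instead of the engine's `start_profile`/`stop_profile` hooks. A minimal, self-contained sketch of that pattern (assumes a CUDA device; the profiled workload is a placeholder):

```python
# Standalone sketch of the torch.profiler pattern from the old code path.
import torch

def work():
    # placeholder workload: one large matmul on the GPU
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    return a @ b

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    # writes a trace viewable in TensorBoard (or ui.perfetto.dev)
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./trace"),
) as prof:
    work()

print(prof.key_averages().table(sort_by="self_cuda_time_total"))
```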
@@ -152,6 +164,15 @@ if __name__ == "__main__":
action="store_true", action="store_true",
help="profile the generation process of a single batch", help="profile the generation process of a single batch",
) )
parser.add_argument(
"--profile-result-dir",
type=str,
default=None,
help=(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."
),
)
parser.add_argument( parser.add_argument(
"--output-json", "--output-json",
type=str, type=str,
@@ -172,9 +193,4 @@ if __name__ == "__main__":
# numbers. We need to disable prefix caching by default. # numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False) parser.set_defaults(enable_prefix_caching=False)
args = parser.parse_args() args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main(args) main(args)

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Offline benchmark to test the long document QA throughput. Offline benchmark to test the long document QA throughput.

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Benchmark the efficiency of prefix caching. Benchmark the efficiency of prefix caching.

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization.""" """Benchmark offline prioritization."""
import argparse import argparse

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput. r"""Benchmark online serving throughput.
On the server side, run one of the following commands: On the server side, run one of the following commands:
@@ -61,7 +60,6 @@ from benchmark_dataset import (
ASRDataset, ASRDataset,
BurstGPTDataset, BurstGPTDataset,
ConversationDataset, ConversationDataset,
CustomDataset,
HuggingFaceDataset, HuggingFaceDataset,
InstructCoderDataset, InstructCoderDataset,
MTBenchDataset, MTBenchDataset,
@@ -629,16 +627,7 @@ def main(args: argparse.Namespace):
"'--dataset-path' if required." "'--dataset-path' if required."
) )
if args.dataset_name == "custom": if args.dataset_name == "sonnet":
dataset = CustomDataset(dataset_path=args.dataset_path)
input_requests = dataset.sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.custom_output_len,
skip_chat_template=args.custom_skip_chat_template,
)
elif args.dataset_name == "sonnet":
dataset = SonnetDataset(dataset_path=args.dataset_path) dataset = SonnetDataset(dataset_path=args.dataset_path)
# For the "sonnet" dataset, formatting depends on the backend. # For the "sonnet" dataset, formatting depends on the backend.
if args.backend == "openai-chat": if args.backend == "openai-chat":
@@ -773,10 +762,6 @@ def main(args: argparse.Namespace):
if "temperature" not in sampling_params: if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding. sampling_params["temperature"] = 0.0 # Default to greedy decoding.
if args.backend == "llama.cpp":
# Disable prompt caching in llama.cpp backend
sampling_params["cache_prompt"] = False
# Avoid GC processing "static" data - reduce pause times. # Avoid GC processing "static" data - reduce pause times.
gc.collect() gc.collect()
gc.freeze() gc.freeze()
@@ -849,8 +834,6 @@ def main(args: argparse.Namespace):
]: ]:
if field in result_json: if field in result_json:
del result_json[field] del result_json[field]
if field in benchmark_result:
del benchmark_result[field]
# Save to file # Save to file
base_model_id = model_id.split("/")[-1] base_model_id = model_id.split("/")[-1]
@@ -863,7 +846,6 @@ def main(args: argparse.Namespace):
if args.result_filename: if args.result_filename:
file_name = args.result_filename file_name = args.result_filename
if args.result_dir: if args.result_dir:
os.makedirs(args.result_dir, exist_ok=True)
file_name = os.path.join(args.result_dir, file_name) file_name = os.path.join(args.result_dir, file_name)
with open( with open(
file_name, mode="a+" if args.append_result else "w", encoding="utf-8" file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
@@ -904,7 +886,7 @@ if __name__ == "__main__":
"--dataset-name", "--dataset-name",
type=str, type=str,
default="sharegpt", default="sharegpt",
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
) )
parser.add_argument( parser.add_argument(
@@ -1074,19 +1056,6 @@ if __name__ == "__main__":
) )
# group for dataset specific arguments # group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
custom_group.add_argument(
"--custom-output-len",
type=int,
default=256,
help="Number of output tokens per request, used only for custom dataset.",
)
custom_group.add_argument(
"--custom-skip-chat-template",
action="store_true",
help="Skip applying chat template to prompt, used only for custom dataset.",
)
sonnet_group = parser.add_argument_group("sonnet dataset options") sonnet_group = parser.add_argument_group("sonnet dataset options")
sonnet_group.add_argument( sonnet_group.add_argument(
"--sonnet-input-len", "--sonnet-input-len",

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput with structured outputs. r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands: On the server side, run one of the following commands:
@@ -12,6 +11,7 @@ On the client side, run:
--model <your_model> \ --model <your_model> \
--dataset json \ --dataset json \
--structured-output-ratio 1.0 \ --structured-output-ratio 1.0 \
--structured-output-backend auto \
--request-rate 10 \ --request-rate 10 \
--num-prompts 1000 --num-prompts 1000

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput.""" """Benchmark offline inference throughput."""
import argparse import argparse

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
@@ -66,9 +65,4 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None: def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f: with open(filename, "w") as f:
json.dump( json.dump(records, f, cls=InfEncoder)
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
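The `default=` argument dropped on the right is `json.dump`'s escape hatch for values the encoder cannot handle: without it the dump raises `TypeError`, with it they are replaced by placeholder strings. A small sketch of the difference:

```python
# json.dump/json.dumps call `default` for any object the encoder can't
# serialize; here it turns such objects into readable placeholders.
import json

class Opaque:
    pass

record = {"ok": 1, "bad": Opaque()}
print(json.dumps(
    record,
    default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
))
# -> {"ok": 1, "bad": "<Opaque object is not JSON serializable>"}
```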

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils # Cutlass bench utils
from collections.abc import Iterable from collections.abc import Iterable

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format # Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM) # ([K, N], TP_SPLIT_DIM)

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import itertools import itertools

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle as pkl import pickle as pkl
import time import time

View File

@@ -1,223 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from vllm.triton_utils import triton
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=[
"torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
line_names=[
"torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs FP8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
# Create input tensors
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if "torch-bf16" in provider:
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
elif "fp8" in provider:
# Weights are always quantized ahead of time
if "noquant" in provider:
# For no quantization, we just measure the GEMM
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
# In these cases, we quantize the activations during the GEMM call
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
b_fp8 = b_fp8.t()
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
# Calculate TFLOP/s, two flops per multiply-add
tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
return tflops(ms), tflops(max_ms), tflops(min_ms)
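# Worked example for the conversion above (editorial note, not original code):
# with M = N = K = 4096 and ms = 1.0, flops = 2*M*N*K ~= 1.374e11, so
# tflops(1.0) = 1.374e11 * 1e-12 / 1e-3 ~= 137.4 TFLOP/s.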
def prepare_shapes(args):
KN_model_names = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
assert model in WEIGHT_SHAPES
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KN.append(model)
KN_model_names.append(KN)
return KN_model_names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=[*WEIGHT_SHAPES.keys()],
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
)
args = parser.parse_args()
KN_model_names = prepare_shapes(args)
for K, N, model_name in KN_model_names:
print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_fp8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import sys import sys

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation. # Copyright (c) Microsoft Corporation.
# Licensed under the MIT License. # Licensed under the MIT License.

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@@ -91,7 +90,7 @@ def bench_run(
score = torch.randn((m, num_experts), device=device, dtype=dtype) score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
quant_blocksize = 16 quant_blocksize = 16
w1_blockscale = torch.empty( w1_blockscale = torch.empty(

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark
@@ -7,8 +6,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import (
cutlass_moe_fp8,
fused_experts, fused_experts,
fused_topk, fused_topk,
) )
@@ -70,9 +69,18 @@ def bench_run(
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
for expert in range(num_experts): for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype) score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
@@ -113,6 +121,10 @@ def bench_run(
w2_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
num_repeats: int, num_repeats: int,
): ):
for _ in range(num_repeats): for _ in range(num_repeats):
@@ -120,10 +132,14 @@ def bench_run(
a, a,
w1, w1,
w2, w2,
topk_weights,
topk_ids,
w1_scale, w1_scale,
w2_scale, w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale, a1_scale=a_scale,
) )
@@ -136,6 +152,10 @@ def bench_run(
w2_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
): ):
with set_current_vllm_config( with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@@ -144,10 +164,14 @@ def bench_run(
a, a,
w1_q, w1_q,
w2_q, w2_q,
topk_weights,
topk_ids,
w1_scale, w1_scale,
w2_scale, w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale, a1_scale=a_scale,
) )
@@ -193,6 +217,10 @@ def bench_run(
w2_scale, w2_scale,
topk_weights, topk_weights,
topk_ids, topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
) )
torch.cuda.synchronize() torch.cuda.synchronize()
@@ -201,8 +229,8 @@ def bench_run(
with torch.cuda.graph(triton_graph, stream=triton_stream): with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph( run_triton_from_graph(
a, a,
w1_q, w1_q_notransp,
w2_q, w2_q_notransp,
topk_weights, topk_weights,
topk_ids, topk_ids,
w1_scale, w1_scale,
@@ -221,12 +249,18 @@ def bench_run(
"w2": w2, "w2": w2,
"score": score, "score": score,
"topk": topk, "topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params # Cutlass params
"a_scale": a_scale, "a_scale": a_scale,
"w1_q": w1_q, "w1_q": w1_q,
"w2_q": w2_q, "w2_q": w2_q,
"w1_scale": w1_scale, "w1_scale": w1_scale,
"w2_scale": w2_scale, "w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params # cuda graph params
"cutlass_graph": cutlass_graph, "cutlass_graph": cutlass_graph,
"triton_graph": triton_graph, "triton_graph": triton_graph,
@@ -244,8 +278,8 @@ def bench_run(
# Warmup # Warmup
run_triton_moe( run_triton_moe(
a, a,
w1_q, w1_q_notransp,
w2_q, w2_q_notransp,
topk_weights, topk_weights,
topk_ids, topk_ids,
w1_scale, w1_scale,
@@ -256,7 +290,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@@ -287,12 +321,16 @@ def bench_run(
w2_scale, w2_scale,
topk_weights, topk_weights,
topk_ids, topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
num_warmup, num_warmup,
) )
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501 stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
from typing import Any, TypedDict from typing import Any, TypedDict

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random import random
import time import time

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from typing import Optional, Union from typing import Optional, Union

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate from itertools import accumulate
from typing import Optional from typing import Optional
@@ -23,7 +22,7 @@ def benchmark_rope_kernels_multi_lora(
seed: int, seed: int,
device: str, device: str,
max_position: int = 8192, max_position: int = 8192,
base: float = 10000, base: int = 10000,
) -> None: ) -> None:
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
torch.set_default_device(device) torch.set_default_device(device)

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
WEIGHT_SHAPES = { WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]], "ideal": [[4 * 256 * 32, 256 * 32]],

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from sglang quantization/tuning_block_wise_kernel.py # Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse import argparse

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off # fmt: off
# ruff: noqa: E501 # ruff: noqa: E501
import time import time

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math import math
import pickle import pickle

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from collections.abc import Iterable from collections.abc import Iterable

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format # Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM) # ([K, N], TP_SPLIT_DIM)
@@ -49,50 +48,4 @@ WEIGHT_SHAPES = {
([16384, 106496], 1), ([16384, 106496], 1),
([53248, 16384], 0), ([53248, 16384], 0),
], ],
"meta-llama/Llama-3.1-8B-Instruct": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-3.3-70B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"mistralai/Mistral-Large-Instruct-2407": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 57344], 1),
([28672, 12288], 0),
],
"Qwen/Qwen2.5-7B-Instruct": [
([3584, 4608], 1),
([3584, 3584], 0),
([3584, 37888], 1),
([18944, 3584], 0),
],
"Qwen/Qwen2.5-32B-Instruct": [
([5120, 7168], 1),
([5120, 5120], 0),
([5120, 55296], 1),
([27648, 5120], 0),
],
"Qwen/Qwen2.5-72B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 59136], 1),
([29568, 8192], 0),
],
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
([2048, 3072], 1),
([2048, 4096], 1),
([2048, 2048], 0),
([2048, 576], 0),
([2048, 21888], 1),
([10944, 2048], 0),
([2048, 2816], 1),
([1408, 2048], 0),
],
} }

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile import cProfile
import pstats import pstats

View File

@@ -75,7 +75,6 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
else() else()
find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@@ -107,19 +106,13 @@ elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2") list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA") message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) elseif (POWER9_FOUND OR POWER10_FOUND)
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
if (POWER9_FOUND) # Check for PowerPC VSX support
list(APPEND CXX_COMPILE_FLAGS list(APPEND CXX_COMPILE_FLAGS
"-mvsx" "-mvsx"
"-mcpu=power9" "-mcpu=native"
"-mtune=power9") "-mtune=native")
elseif (POWER10_FOUND OR POWER11_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10")
endif()
elseif (ASIMD_FOUND) elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected") message(STATUS "ARMv8 or later architecture detected")

View File

@@ -46,38 +46,22 @@ else()
endif() endif()
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
# Make sure vllm-flash-attn install rules are nested under vllm/
# This is here to support installing all components under the same prefix with cmake --install.
# setup.py installs every component separately but uses the same prefix for all.
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
# and these statements are harmless when neither component is installed.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
# Fetch the vllm-flash-attn library # Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn) FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done) # case only one is built, in the case both are built redundant work is done)
install( install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn DESTINATION vllm_flash_attn
COMPONENT _vllm_fa2_C COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py" FILES_MATCHING PATTERN "*.py"
) )
install( install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn DESTINATION vllm_flash_attn
COMPONENT _vllm_fa3_C COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py" FILES_MATCHING PATTERN "*.py"
) )

View File

@@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# #
# A command line tool for running pytorch's hipify preprocessor on CUDA # A command line tool for running pytorch's hipify preprocessor on CUDA

View File

@@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
add_custom_target( add_custom_target(
hipify${NAME} hipify${NAME}
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
BYPRODUCTS ${HIP_SRCS} BYPRODUCTS ${HIP_SRCS}
COMMENT "Running hipify on ${NAME} extension source files.") COMMENT "Running hipify on ${NAME} extension source files.")

View File

@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
{static_cast<ElementOut*>(out.data_ptr()), stride_O, {static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE}, static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info, hw_info,
1, // split_kv -1, // split_kv
nullptr, // is_var_split_kv nullptr, // is_var_split_kv
}; };
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum import enum
from typing import Union from typing import Union

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob import glob
import itertools import itertools
import os import os

View File

@@ -31,7 +31,3 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
#endif #endif
bool moe_permute_unpermute_supported(); bool moe_permute_unpermute_supported();
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor);

View File

@@ -130,62 +130,6 @@ void moe_unpermute(
   });
 }

-template <typename T>
-__global__ void shuffleInputRowsKernel(const T* input,
-                                       const int32_t* dst2src_map, T* output,
-                                       int64_t num_src_rows,
-                                       int64_t num_dst_rows, int64_t num_cols) {
-  int64_t dest_row_idx = blockIdx.x;
-  int64_t const source_row_idx = dst2src_map[dest_row_idx];
-
-  if (blockIdx.x < num_dst_rows) {
-    // Load 128-bits per thread
-    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
-    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
-
-    // Duplicate and permute rows
-    auto const* source_row_ptr =
-        reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
-    auto* dest_row_ptr =
-        reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
-
-    int64_t const start_offset = threadIdx.x;
-    int64_t const stride = blockDim.x;
-    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
-
-    for (int elem_index = start_offset; elem_index < num_elems_in_col;
-         elem_index += stride) {
-      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
-    }
-  }
-}
-
-void shuffle_rows(const torch::Tensor& input_tensor,
-                  const torch::Tensor& dst2src_map,
-                  torch::Tensor& output_tensor) {
-  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
-              "Input and output tensors must have the same data type");
-
-  auto stream = at::cuda::getCurrentCUDAStream().stream();
-  int64_t const blocks = output_tensor.size(0);
-  int64_t const threads = 256;
-  int64_t const num_dest_rows = output_tensor.size(0);
-  int64_t const num_src_rows = input_tensor.size(0);
-  int64_t const num_cols = input_tensor.size(1);
-
-  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
-              "num_cols must be divisible by 128 / "
-              "sizeof(input_tensor.scalar_type()) / 8");
-
-  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
-    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
-        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
-        dst2src_map.data_ptr<int32_t>(),
-        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
-        num_dest_rows, num_cols);
-  });
-}
-
 #else

 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
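The removed shuffle_rows gathers each destination row from a source row named by dst2src_map, one block per row, each thread copying 128 bits per step. The same access pattern as a minimal standalone CUDA sketch (float4 assumed in place of cutlass::Array, so num_cols must be divisible by 4):

#include <cstdint>
#include <cuda_runtime.h>

// One block per output row; threads stride across the row in
// 128-bit (float4) chunks.
__global__ void shuffle_rows_sketch(const float* __restrict__ input,
                                    const int32_t* __restrict__ dst2src_map,
                                    float* __restrict__ output,
                                    int64_t num_cols) {
  int64_t const dst_row = blockIdx.x;
  int64_t const src_row = dst2src_map[dst_row];
  auto const* src =
      reinterpret_cast<float4 const*>(input + src_row * num_cols);
  auto* dst = reinterpret_cast<float4*>(output + dst_row * num_cols);
  for (int64_t i = threadIdx.x; i < num_cols / 4; i += blockDim.x) {
    dst[i] = src[i];
  }
}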

View File

@@ -14,13 +14,12 @@
     __VA_ARGS__();                                               \
     break;                                                       \
   }

 #define MOE_DISPATCH_FLOAT_CASE(...)                             \
   MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)          \
   MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)           \
   MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)       \
   MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)    \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)  \
-  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)

 #define MOE_DISPATCH(TYPE, ...) \
   MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@@ -40,11 +39,6 @@ template <>
 struct ScalarType2CudaType<at::ScalarType::BFloat16> {
   using type = __nv_bfloat16;
 };

-// uint8 for packed fp4
-template <>
-struct ScalarType2CudaType<at::ScalarType::Byte> {
-  using type = uint8_t;
-};
-
 // #if __CUDA_ARCH__ >= 890
 // fp8

View File

@@ -516,8 +516,9 @@ void topk_softmax(
topk, topk,
stream); stream);
} }
else if (topk_indices.scalar_type() == at::ScalarType::UInt32) else
{ {
assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
vllm::moe::topkGatingSoftmaxKernelLauncher( vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(), gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
@@ -529,17 +530,4 @@ void topk_softmax(
             topk,
             stream);
     }
-    else {
-        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
-        vllm::moe::topkGatingSoftmaxKernelLauncher(
-            gating_output.data_ptr<float>(),
-            topk_weights.data_ptr<float>(),
-            topk_indices.data_ptr<int64_t>(),
-            token_expert_indices.data_ptr<int>(),
-            softmax_workspace.data_ptr<float>(),
-            num_tokens,
-            num_experts,
-            topk,
-            stream);
-    }
 }

View File

@@ -81,12 +81,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def("moe_permute_unpermute_supported() -> bool"); m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
// Row shuffle for MoE
m.def(
"shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
"output_tensor) -> ()");
m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
#endif #endif
} }

View File

@@ -92,11 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);

-void apply_repetition_penalties_(torch::Tensor& logits,
-                                 const torch::Tensor& prompt_mask,
-                                 const torch::Tensor& output_mask,
-                                 const torch::Tensor& repetition_penalties);
-
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
                                double epsilon);
@@ -236,8 +231,7 @@ void cutlass_moe_mm(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides);

 void cutlass_fp4_group_mm(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@@ -249,16 +243,7 @@ void get_cutlass_moe_mm_data(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets);
-
-void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
-                                  torch::Tensor& problem_sizes1,
-                                  torch::Tensor& problem_sizes2,
-                                  const torch::Tensor& expert_num_tokens,
-                                  const int64_t num_local_experts,
-                                  const int64_t padded_m, const int64_t n,
-                                  const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k);

 void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,

View File

@@ -9,6 +9,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
                                            torch::Tensor const& b,
                                            torch::Tensor const& a_scales,
                                            torch::Tensor const& b_scales) {
+  TORCH_CHECK(
+      a.size(0) % 4 == 0,
+      "Input tensor must have a number of rows that is a multiple of 4. ",
+      "but got: ", a.size(0), " rows.");
   if (out.dtype() == torch::kBFloat16) {
     cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
         out, a, b, a_scales, b_scales);
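The side with this guard fails fast on row counts the 1-SM blockwise kernel cannot serve; the other side has no such check because its dispatch (further down) reroutes small or misaligned m through a swap-AB kernel instead. A minimal sketch of the same pre-flight validation, detached from torch (hypothetical name):

#include <stdexcept>
#include <string>

// Mirror of the TORCH_CHECK above: reject M not divisible by 4 before
// launching a kernel whose epilogue assumes 4-row alignment.
void check_rows_multiple_of_4(long long rows) {
  if (rows % 4 != 0) {
    throw std::invalid_argument("rows must be a multiple of 4, got " +
                                std::to_string(rows));
  }
}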

View File

@@ -1,6 +1,5 @@
 #pragma once

-#include "cuda_utils.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
@@ -23,49 +22,49 @@ namespace vllm {
 using namespace cute;

-// clang-format off
-template <class OutType, int ScaleGranularityM,
-          int ScaleGranularityN, int ScaleGranularityK,
-          class MmaTileShape, class ClusterShape,
-          class EpilogueScheduler, class MainloopScheduler,
-          bool swap_ab_ = false>
+template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
+          typename ClusterShape, typename EpilogueScheduler,
+          typename MainloopScheduler>
 struct cutlass_3x_gemm_fp8_blockwise {
-  static constexpr bool swap_ab = swap_ab_;
   using ElementAB = cutlass::float_e4m3_t;

   using ElementA = ElementAB;
   using LayoutA = cutlass::layout::RowMajor;
-  using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
   static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;

   using ElementB = ElementAB;
   using LayoutB = cutlass::layout::ColumnMajor;
-  using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
   static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;

-  using ElementC = void;
   using ElementD = OutType;
   using LayoutD = cutlass::layout::RowMajor;
-  using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
   static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

+  using ElementC = void;  // TODO: support bias
   using LayoutC = LayoutD;
-  using LayoutC_Transpose = LayoutD_Transpose;
   static constexpr int AlignmentC = AlignmentD;

   using ElementAccumulator = float;
   using ElementCompute = float;
   using ElementBlockScale = float;

-  using ScaleConfig = conditional_t<swap_ab,
-      cutlass::detail::Sm100BlockwiseScaleConfig<
-          ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-          cute::UMMA::Major::K, cute::UMMA::Major::MN>,
-      cutlass::detail::Sm100BlockwiseScaleConfig<
-          ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
-          cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
+  // MMA and Cluster Tile Shapes
+  // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
+  // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
+  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
+  static constexpr int ScaleGranularityM =
+      size<0>(MmaTileShape{}) / ScaleMsPerTile;
+  static constexpr int ScaleGranularityN =
+      size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
+  static constexpr int ScaleGranularityK =
+      size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});

-  // layout_SFA and layout_SFB cannot be swapped since they are deduced.
+  // Shape of the threadblocks in a cluster
+  using ClusterShape_MNK = ClusterShape;
+
+  using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
+      ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
+      cute::UMMA::Major::MN, cute::UMMA::Major::K>;
+
   using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
   using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
@@ -74,6 +73,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
   static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
   using ElementScalar = float;
+  // clang-format off
   using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
   using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       ArchTag,
@@ -84,47 +84,33 @@ struct cutlass_3x_gemm_fp8_blockwise {
       ElementAccumulator,
       ElementCompute,
       ElementC,
-      conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
+      LayoutC,
       AlignmentC,
       ElementD,
-      conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+      LayoutD,
       AlignmentD,
       EpilogueScheduler,
       DefaultOperation
   >::CollectiveOp;

   using StageCountType = cutlass::gemm::collective::StageCountAuto;
-  using CollectiveMainloop = conditional_t<swap_ab,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag,
-          OperatorClass,
-          ElementB,
-          cute::tuple<LayoutB_Transpose, LayoutSFA>,
-          AlignmentB,
-          ElementA,
-          cute::tuple<LayoutA_Transpose, LayoutSFB>,
-          AlignmentA,
-          ElementAccumulator,
-          MmaTileShape,
-          ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          MainloopScheduler
-      >::CollectiveOp,
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag,
-          OperatorClass,
-          ElementA,
-          cute::tuple<LayoutA, LayoutSFA>,
-          AlignmentA,
-          ElementB,
-          cute::tuple<LayoutB, LayoutSFB>,
-          AlignmentB,
-          ElementAccumulator,
-          MmaTileShape,
-          ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          MainloopScheduler
-      >::CollectiveOp>;
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduler
+  >::CollectiveOp;
+  // clang-format on

   using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
@@ -137,7 +123,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                    torch::Tensor const& b,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales) {
-  static constexpr bool swap_ab = Gemm::swap_ab;
   using GemmKernel = typename Gemm::GemmKernel;
   using StrideA = typename Gemm::GemmKernel::StrideA;
   using StrideB = typename Gemm::GemmKernel::StrideB;
@@ -151,6 +136,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   using ElementD = typename Gemm::ElementD;

   int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape = cute::make_shape(m, n, k, 1);

   StrideA a_stride;
   StrideB b_stride;
@@ -160,13 +146,11 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   b_stride =
       cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
   c_stride =
-      cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));

-  LayoutSFA layout_SFA = swap_ab ?
-      ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
-      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
-  LayoutSFB layout_SFB = swap_ab ?
-      ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
-      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+  LayoutSFA layout_SFA =
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB =
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));

   auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@@ -174,22 +158,9 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
   auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
   auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());

-  auto mainloop_args = [&](){
-    // layout_SFA and layout_SFB cannot be swapped since they are deduced.
-    if (swap_ab) {
-      return typename GemmKernel::MainloopArguments{
-          b_ptr, b_stride, a_ptr, a_stride,
-          b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
-      };
-    }
-    else {
-      return typename GemmKernel::MainloopArguments{
-          a_ptr, a_stride, b_ptr, b_stride,
-          a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
-      };
-    }
-  }();
-  auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+  typename GemmKernel::MainloopArguments mainloop_args{
+      a_ptr, a_stride, b_ptr, b_stride,
+      a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};

   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
@@ -204,74 +175,29 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
                                                torch::Tensor const& b,
                                                torch::Tensor const& a_scales,
                                                torch::Tensor const& b_scales) {
-  int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
+  auto m = a.size(0);
+  auto k = a.size(1);
+  auto n = b.size(1);
+  int sms;
   cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());

-  constexpr int TILE_K = 128;
-  // TODO: better heuristics
-  bool swap_ab = (m < 16) || (m % 4 != 0);
-  bool use_tma_epilogue = (m * n) % 4 == 0;
-  if (!swap_ab) {
-    constexpr int TILE_N = 128;
-    int tile_m = 256;
-    if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
-      tile_m = 64;
-    }
-    else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
-      tile_m = 128;
-    }
-    if (tile_m == 64) {
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    } else if (tile_m == 128) {
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    } else {  // tile_m == 256
-      if (use_tma_epilogue) {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      } else {
-        cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-            OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-            Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
-            cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
-            out, a, b, a_scales, b_scales);
-      }
-    }
-  } else {
-    // TODO: Test more tile N configs
-    constexpr int TILE_M = 128;
-    constexpr int TILE_N = 16;
-    // TMA epilogue isn't compatible with Swap A/B
+  auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
+    return std::ceil(static_cast<float>(m) / tile1SM) *
+               std::ceil(static_cast<float>(n) / tile1SM) >=
+           sms;
+  };
+  bool use_2sm = should_use_2sm(m, n);
+  if (use_2sm) {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
-        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
+        OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
+        Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
+        out, a, b, a_scales, b_scales);
+  } else {
+    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+        OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
+        Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
+        cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
 }
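Two different dispatch heuristics meet in this hunk: one side picks a 2-SM cluster kernel whenever the output tile grid is large enough to occupy every SM; the other swaps the A/B operands when m is small or not 4-aligned, which is also why its scale granularities flip from <1, TILE_N, TILE_K> to <TILE_M, 1, TILE_K>: the per-token scales follow the operand that now plays the N role. The occupancy test in isolation, with integer math in place of std::ceil:

// Sketch of the should_use_2sm test above: a 2-SM (cluster) kernel only
// pays off when there are at least as many 128x128 output tiles as SMs;
// otherwise part of each cluster idles.
constexpr long long ceil_div(long long a, long long b) {
  return (a + b - 1) / b;
}

bool should_use_2sm_sketch(long long m, long long n, int sms,
                           int tile = 128) {
  return ceil_div(m, tile) * ceil_div(n, tile) >= sms;
}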

View File

@@ -15,7 +15,6 @@ using c3x::cutlass_gemm_caller;
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_default {
-  // M in (128, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
   using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
@@ -26,34 +25,6 @@ struct sm100_fp8_config_default {
                             KernelSchedule, EpilogueSchedule>;
 };

-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M128 {
-  // M in (64, 128]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _64>;
-  using ClusterShape = Shape<_2, _2, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M64 {
-  // M in [1, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _8, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue,
           typename... EpilogueArgs>
@@ -68,28 +39,8 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
   using Cutlass3xGemmDefault =
       typename sm100_fp8_config_default<InType, OutType,
                                         Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
+  return cutlass_gemm_caller<Cutlass3xGemmDefault>(
+      out, a, b, std::forward<EpilogueArgs>(args)...);
 }

 template <template <typename, typename, typename> typename Epilogue,
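One side sends every shape to the default configuration; the other buckets by m rounded up to a power of two so small GEMMs land on smaller tiles. Assuming next_pow_2 behaves as sketched here, the bucketing reduces to two comparisons:

#include <cstdint>

// Assumed behavior of next_pow_2: round m up to a power of two.
uint32_t next_pow_2_sketch(uint32_t m) {
  uint32_t p = 1;
  while (p < m) p <<= 1;
  return p;
}

// Mirrors the removed dispatch: [1,64] -> 64x64x256 tile,
// (64,128] -> 128x128x64 tile, (128,inf) -> default.
const char* pick_sm100_fp8_config(uint32_t m) {
  uint32_t const mp2 = next_pow_2_sketch(m < 64 ? 64 : m);
  if (mp2 <= 64) return "M64";
  if (mp2 <= 128) return "M128";
  return "default";
}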

View File

@@ -84,8 +84,7 @@ void run_cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
   TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
   TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@@ -114,23 +113,19 @@ void run_cutlass_moe_mm_sm90(
   if (n >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else if (k >= 8192) {
     cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else if (m <= 16) {
     cutlass_group_gemm_caller<Cutlass3xGemmM16>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else {
     cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   }
 }
@@ -139,18 +134,15 @@ void dispatch_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   if (out_tensors.dtype() == torch::kBFloat16) {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   } else {
     run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
-        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
-        per_out_ch);
+        problem_sizes, a_strides, b_strides, c_strides);
   }
 }
@@ -161,9 +153,8 @@ void cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                        expert_offsets, problem_sizes, a_strides, b_strides,
-                       c_strides, per_act_token, per_out_ch);
+                       c_strides);
 }

View File

@@ -76,8 +76,7 @@ void cutlass_group_gemm_caller(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
@@ -85,6 +84,9 @@ void cutlass_group_gemm_caller(
   int k_size = a_tensors.size(1);
   int n_size = out_tensors.size(1);

+  bool per_act_token = a_scales.numel() != 1;
+  bool per_out_ch = b_scales.numel() != num_experts;
+
   auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

   auto options_int =
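Rather than threading per_act_token/per_out_ch through every signature, this side derives them from the scale tensors themselves: a per-token activation scale has one value per row instead of a single value, and per-output-channel weight scales have more values than experts. The rule in isolation:

#include <cstdint>

// Sketch of the derivation inside cutlass_group_gemm_caller above.
struct ScaleMode {
  bool per_act_token;  // one scale per activation row, not per tensor
  bool per_out_ch;     // one scale per output channel, not per expert
};

ScaleMode infer_scale_mode(int64_t a_scales_numel, int64_t b_scales_numel,
                           int64_t num_experts) {
  return {a_scales_numel != 1, b_scales_numel != num_experts};
}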

View File

@@ -7,7 +7,7 @@
 constexpr uint64_t THREADS_PER_EXPERT = 512;

-__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const int* __restrict__ topk_ids,
                                       int32_t* problem_sizes1,
                                       int32_t* problem_sizes2,
                                       int32_t* atomic_buffer,
@@ -45,24 +45,7 @@ __global__ void compute_expert_offsets(
   }
 }

-__global__ void compute_expert_blockscale_offsets(
-    const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* blockscale_offsets, int32_t* atomic_buffer,
-    const int num_experts) {
-  int32_t tot_offset = 0;
-  int32_t tot_offset_round = 0;
-  expert_offsets[0] = 0;
-  blockscale_offsets[0] = 0;
-  for (int i = 0; i < num_experts; ++i) {
-    atomic_buffer[i] = tot_offset;
-    tot_offset += problem_sizes1[i * 3];
-    expert_offsets[i + 1] = tot_offset;
-    tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
-    blockscale_offsets[i + 1] = tot_offset_round;
-  }
-}
-
-__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
                                   const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,
@@ -94,8 +77,7 @@ void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t num_experts, const int64_t n, const int64_t k) {
   auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
   auto options_int32 =
       torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
@@ -103,61 +85,19 @@ void get_cutlass_moe_mm_data_caller(
   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());

   compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<int32_t*>(problem_sizes1.data_ptr()),
       static_cast<int32_t*>(problem_sizes2.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
-  if (blockscale_offsets.has_value()) {
-    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
-        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(expert_offsets.data_ptr()),
-        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
-  } else {
-    compute_expert_offsets<<<1, 1, 0, stream>>>(
-        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
-        static_cast<int32_t*>(expert_offsets.data_ptr()),
-        static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
-  }
+  compute_expert_offsets<<<1, 1, 0, stream>>>(
+      static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(expert_offsets.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
       topk_ids.size(1));
 }
-
-__global__ void compute_pplx_data(int32_t* expert_offsets,
-                                  int32_t* problem_sizes1,
-                                  int32_t* problem_sizes2,
-                                  const int32_t* __restrict__ expert_num_tokens,
-                                  const int padded_m, const int n,
-                                  const int k) {
-  int expert_idx = threadIdx.x;
-
-  expert_offsets[expert_idx] = expert_idx * padded_m;
-  problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx];
-  problem_sizes1[expert_idx * 3 + 1] = 2 * n;
-  problem_sizes1[expert_idx * 3 + 2] = k;
-  problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx];
-  problem_sizes2[expert_idx * 3 + 1] = k;
-  problem_sizes2[expert_idx * 3 + 2] = n;
-}
-
-void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
-                                         torch::Tensor& problem_sizes1,
-                                         torch::Tensor& problem_sizes2,
-                                         const torch::Tensor& expert_num_tokens,
-                                         const int64_t num_local_experts,
-                                         const int64_t padded_m,
-                                         const int64_t n, const int64_t k) {
-  auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
-
-  compute_pplx_data<<<1, num_local_experts, 0, stream>>>(
-      static_cast<int32_t*>(expert_offsets.data_ptr()),
-      static_cast<int32_t*>(problem_sizes1.data_ptr()),
-      static_cast<int32_t*>(problem_sizes2.data_ptr()),
-      static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
-      k);
-}
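compute_expert_blockscale_offsets, removed above, is a serial prefix sum over per-expert row counts that also keeps a second running total rounded up to 128-row blocks, so each expert's block scales start on a block boundary. The same arithmetic on the host:

#include <cstdint>
#include <vector>

// Host sketch of the removed single-thread kernel: expert_offsets is the
// plain prefix sum; blockscale_offsets rounds each expert's row count up
// to a multiple of 128 before accumulating.
void expert_offsets_sketch(std::vector<int32_t> const& rows_per_expert,
                           std::vector<int32_t>& expert_offsets,
                           std::vector<int32_t>& blockscale_offsets) {
  int32_t tot = 0;
  int32_t tot_round = 0;
  expert_offsets.assign(1, 0);
  blockscale_offsets.assign(1, 0);
  for (int32_t const rows : rows_per_expert) {
    tot += rows;
    tot_round += (rows + 127) / 128 * 128;
    expert_offsets.push_back(tot);
    blockscale_offsets.push_back(tot_round);
  }
}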

View File

@@ -36,8 +36,7 @@ void cutlass_moe_mm_sm90(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch);
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides);

 #endif
@@ -55,16 +54,7 @@ void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets);
-
-void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
-                                         torch::Tensor& problem_sizes1,
-                                         torch::Tensor& problem_sizes2,
-                                         const torch::Tensor& expert_num_tokens,
-                                         const int64_t num_local_experts,
-                                         const int64_t padded_m,
-                                         const int64_t n, const int64_t k);
+    const int64_t num_experts, const int64_t n, const int64_t k);
 #endif

 void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@@ -216,13 +206,12 @@ void cutlass_moe_mm(
     torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
     torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
     torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
-    torch::Tensor const& b_strides, torch::Tensor const& c_strides,
-    bool per_act_token, bool per_out_ch) {
+    torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
   cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                       expert_offsets, problem_sizes, a_strides, b_strides,
-                      c_strides, per_act_token, per_out_ch);
+                      c_strides);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
@@ -235,8 +224,7 @@ void get_cutlass_moe_mm_data(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
     torch::Tensor& input_permutation, torch::Tensor& output_permutation,
-    const int64_t num_experts, const int64_t n, const int64_t k,
-    const std::optional<torch::Tensor>& blockscale_offsets) {
+    const int64_t num_experts, const int64_t n, const int64_t k) {
   // This function currently gets compiled only if we have a valid cutlass moe
   // mm to run it for.
   int32_t version_num = get_sm_version_num();
@@ -244,8 +232,7 @@ void get_cutlass_moe_mm_data(
     (defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
   get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
                                  problem_sizes2, input_permutation,
-                                 output_permutation, num_experts, n, k,
-                                 blockscale_offsets);
+                                 output_permutation, num_experts, n, k);
   return;
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
@@ -255,29 +242,6 @@ void get_cutlass_moe_mm_data(
version_num, ". Required capability: 90"); version_num, ". Required capability: 90");
} }
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k) {
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num, ". Required capability: 90");
}
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& b,
torch::Tensor const& a_scales, torch::Tensor const& a_scales,

View File

@@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   fp8_type* __restrict__ token_output = &out[offset];

   // For vectorization, token_input and token_output pointers need to be
-  // aligned at 32-byte and 16-byte addresses respectively.
-  bool const can_vectorize = hidden_size % 16 == 0;
+  // aligned at 8-byte and 4-byte addresses respectively.
+  bool const can_vectorize = hidden_size % 4 == 0;

   float absmax_val = 0.0f;
   if (can_vectorize) {
@@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   } else {
     for (int i = tid; i < hidden_size; i += blockDim.x) {
       float const x = static_cast<float>(token_input[i]);
-      absmax_val = fmaxf(absmax_val, fabsf(x));
+      absmax_val = max(absmax_val, fabs(x));
     }
   }

-  using BlockReduce = cub::BlockReduce<float, 256>;
+  using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStorage;
   float const block_absmax_val_maybe =
       BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
   __shared__ float token_scale;
   if (tid == 0) {
     if (scale_ub) {
-      token_scale = fminf(block_absmax_val_maybe, *scale_ub);
+      token_scale = min(block_absmax_val_maybe, *scale_ub);
     } else {
       token_scale = block_absmax_val_maybe;
     }
     // token scale computation
-    token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
-                        min_scaling_factor<fp8_type>::val());
+    token_scale = max(token_scale / quant_type_max_v<fp8_type>,
+                      min_scaling_factor<fp8_type>::val());
     scale[token_idx] = token_scale;
   }
   __syncthreads();
@@ -88,11 +88,10 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
                              torch::Tensor const& input,  // [..., d]
                              torch::Tensor const& scale)  // [1]
 {
-  int const block_size = 256;
-  int const num_tokens = input.numel() / input.size(-1);
-  int const num_elems = input.numel();
-  dim3 const grid(num_tokens);
-  dim3 const block(block_size);
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int64_t num_elems = input.numel();
+  dim3 grid(num_tokens);
+  dim3 block(1024);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -111,11 +110,10 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
                               torch::Tensor const& input,  // [..., d]
                               torch::Tensor& scale)        // [1]
 {
-  int const block_size = 256;
-  int const num_tokens = input.numel() / input.size(-1);
-  int const num_elems = input.numel();
-  dim3 const grid(num_tokens);
-  dim3 const block(block_size);
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int64_t num_elems = input.numel();
+  dim3 grid(num_tokens);
+  dim3 block(1024);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -143,9 +141,8 @@ void dynamic_per_token_scaled_fp8_quant(
   int const hidden_size = input.size(-1);
   int const num_tokens = input.numel() / hidden_size;
-  int const block_size = 256;
   dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, block_size));
+  dim3 const block(std::min(hidden_size, 1024));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
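The 256-vs-1024 change is not a free-standing tuning knob: cub::BlockReduce<float, N> bakes the thread count in at compile time, so the launch configuration and the reduce width must move together, as they do on both sides of this hunk. The coupling, reduced to a sketch:

#include <cub/cub.cuh>

// BLOCK is baked into the reduction; launching with any other
// blockDim.x silently breaks it. Launch as:
//   absmax_sketch<256><<<grid, 256>>>(in, out, n);
template <int BLOCK>
__global__ void absmax_sketch(const float* in, float* out, int n) {
  using BlockReduce = cub::BlockReduce<float, BLOCK>;
  __shared__ typename BlockReduce::TempStorage tmp;
  float v = 0.f;
  for (int i = threadIdx.x; i < n; i += BLOCK) {
    v = fmaxf(v, fabsf(in[i]));
  }
  float const block_max = BlockReduce(tmp).Reduce(v, cub::Max{});
  if (threadIdx.x == 0) {
    *out = block_max;
  }
}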

View File

@@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
   }

   float r =
-      fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
+      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
 #ifndef USE_ROCM
   return static_cast<fp8_type>(r);
 #else
@@ -65,7 +65,7 @@ template <typename scalar_t, typename fp8_type>
 __global__ void segmented_max_reduction(float* __restrict__ scale,
                                         const scalar_t* __restrict__ input,
                                         int64_t num_elems) {
-  __shared__ float cache[256];
+  __shared__ float cache[1024];
   int64_t i = blockDim.x * blockIdx.x + threadIdx.x;

   // First store maximum for all values processes by
@@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
   scalar_t tmp = 0.0;
   while (i < num_elems) {
     float x = static_cast<float>(input[i]);
-    tmp = fmaxf(tmp, fabsf(x));
+    tmp = max(tmp, fabs(x));
     i += blockDim.x * gridDim.x;
   }
   cache[threadIdx.x] = tmp;
@@ -100,27 +100,25 @@ template <typename scalar_t>
 __device__ float thread_max_vec(scalar_t const* __restrict__ input,
                                 int64_t const num_elems, int const tid,
                                 int const step) {
-  constexpr size_t VEC_SIZE = 16;
-  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
   // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
+  vec4_t<scalar_t> const* vectorized_in =
+      reinterpret_cast<vec4_t<scalar_t> const*>(input);

-  // num_elems / VEC_SIZE (which is 16)
-  int64_t const num_vec_elems = num_elems >> 4;
+  int64_t const num_vec_elems = num_elems >> 2;
   float absmax_val = 0.0f;

-#pragma unroll
+#pragma unroll 4
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    scalarxN_t in_vec = vectorized_in[i];
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
-    }
+    vec4_t<scalar_t> in_vec = vectorized_in[i];
+    absmax_val = max(absmax_val, fabs(in_vec.x));
+    absmax_val = max(absmax_val, fabs(in_vec.y));
+    absmax_val = max(absmax_val, fabs(in_vec.z));
+    absmax_val = max(absmax_val, fabs(in_vec.w));
   }

-  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
-  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
-    absmax_val = fmaxf(absmax_val, fabsf(input[i]));
+  // Handle the remaining elements if num_elems is not divisible by 4
+  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
+    absmax_val = max(absmax_val, fabs(input[i]));
   }

   return absmax_val;
@@ -132,31 +130,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
                                           float const scale,
                                           int64_t const num_elems,
                                           int const tid, int const step) {
-  constexpr size_t VEC_SIZE = 16;
-  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
-  using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
+  using float8x4_t = q8x4_t<fp8_type>;
   // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
-  auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
+  auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
+  auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);

-  // num_elems / VEC_SIZE (which is 16)
-  int64_t const num_vec_elems = num_elems >> 4;
+  int64_t const num_vec_elems = num_elems >> 2;

-#pragma unroll
+#pragma unroll 4
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    scalarxN_t in_vec = vectorized_in[i];
-    float8xN_t out_vec;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-          static_cast<float>(in_vec.val[j]), scale);
-    }
+    vec4_t<scalar_t> in_vec = vectorized_in[i];
+    float8x4_t out_vec;
+    out_vec.x = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.x), scale);
+    out_vec.y = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.y), scale);
+    out_vec.z = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.z), scale);
+    out_vec.w = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+        static_cast<float>(in_vec.w), scale);
     vectorized_out[i] = out_vec;
   }

-  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
-  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
+  // Handle the remaining elements if num_elems is not divisible by 4
+  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
     out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
         static_cast<float>(input[i]), scale);
   }
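The two sides differ in vector width: a generic vec_n_t<scalar_t, 16> with an unrolled val[j] loop (a 32-byte load for fp16) versus a fixed vec4_t with named x/y/z/w fields. Either way, the tail that is not a multiple of the vector width needs scalar mop-up, which is the only subtle part; a host-side sketch:

#include <algorithm>
#include <cmath>
#include <cstddef>

// Widened absmax scan: VEC_SIZE elements per step, then a scalar tail
// for the remainder, as in thread_max_vec above.
template <std::size_t VEC_SIZE>
float absmax_vec_sketch(const float* in, long n) {
  float m = 0.f;
  long const nv = n / static_cast<long>(VEC_SIZE);
  for (long i = 0; i < nv; ++i) {
    const float* base = in + i * static_cast<long>(VEC_SIZE);
    for (std::size_t j = 0; j < VEC_SIZE; ++j) {
      m = std::max(m, std::fabs(base[j]));
    }
  }
  for (long i = nv * static_cast<long>(VEC_SIZE); i < n; ++i) {
    m = std::max(m, std::fabs(in[i]));
  }
  return m;
}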

View File

@@ -140,7 +140,6 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
   // sum of squares
   float ss = 0.0f;

-  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;

 #pragma unroll 4
@@ -148,23 +147,22 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
     vec4_t<scalar_t> in = vec_input[i];

     vec4_t<float> x;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      x.val[j] = static_cast<float>(in.val[j]);
-    }
+    x.x = static_cast<float>(in.x);
+    x.y = static_cast<float>(in.y);
+    x.z = static_cast<float>(in.z);
+    x.w = static_cast<float>(in.w);
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-#pragma unroll
-      for (int j = 0; j < VEC_SIZE; ++j) {
-        x.val[j] += static_cast<float>(r.val[j]);
-      }
+      x.x += static_cast<float>(r.x);
+      x.y += static_cast<float>(r.y);
+      x.z += static_cast<float>(r.z);
+      x.w += static_cast<float>(r.w);
     }

-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      ss += x.val[j] * x.val[j];
-    }
+    ss += x.x * x.x;
+    ss += x.y * x.y;
+    ss += x.z * x.z;
+    ss += x.w * x.w;
   }

   using BlockReduce = cub::BlockReduce<float, 1024>;
@@ -205,7 +203,6 @@ __device__ void compute_dynamic_per_token_scales(
   constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};

-  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;
   float block_absmax_val_maybe = 0.0f;
@@ -215,25 +212,26 @@ __device__ void compute_dynamic_per_token_scales(
     vec4_t<scalar_t> const w = vec_weight[i];

     vec4_t<float> x;
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      x.val[j] = static_cast<float>(in.val[j]);
-    }
+    x.x = static_cast<float>(in.x);
+    x.y = static_cast<float>(in.y);
+    x.z = static_cast<float>(in.z);
+    x.w = static_cast<float>(in.w);
     if constexpr (has_residual) {
       vec4_t<scalar_t> r = vec_residual[i];
-#pragma unroll
-      for (int j = 0; j < VEC_SIZE; ++j) {
-        x.val[j] += static_cast<float>(r.val[j]);
-      }
+      x.x += static_cast<float>(r.x);
+      x.y += static_cast<float>(r.y);
+      x.z += static_cast<float>(r.z);
+      x.w += static_cast<float>(r.w);
     }

-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      block_absmax_val_maybe =
-          fmaxf(block_absmax_val_maybe,
-                fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
-    }
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.x * rms) * w.x));
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.y * rms) * w.y));
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.z * rms) * w.z));
+    block_absmax_val_maybe = fmaxf(
+        block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.w * rms) * w.w));
   }

   using BlockReduce = cub::BlockReduce<float, 1024>;
@@ -284,7 +282,6 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
     vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
   }

-  const int VEC_SIZE = 4;
   int32_t const num_vec_elems = hidden_size >> 2;

   // TODO(luka/varun) extract into type-agnostic vectorized quant function to
@@ -295,31 +292,33 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
vec4_t<scalar_t> const w = vec_weight[i]; vec4_t<scalar_t> const w = vec_weight[i];
vec4_t<float> x; vec4_t<float> x;
#pragma unroll x.x = static_cast<float>(in.x);
for (int j = 0; j < VEC_SIZE; ++j) { x.y = static_cast<float>(in.y);
x.val[j] = static_cast<float>(in.val[j]); x.z = static_cast<float>(in.z);
} x.w = static_cast<float>(in.w);
if constexpr (has_residual) { if constexpr (has_residual) {
vec4_t<scalar_t> r = vec_residual[i]; vec4_t<scalar_t> r = vec_residual[i];
#pragma unroll x.x += static_cast<float>(r.x);
for (int j = 0; j < VEC_SIZE; ++j) { x.y += static_cast<float>(r.y);
x.val[j] += static_cast<float>(r.val[j]); x.z += static_cast<float>(r.z);
} x.w += static_cast<float>(r.w);
// Update residual // Update residual
#pragma unroll r.x = static_cast<scalar_t>(x.x);
for (int j = 0; j < VEC_SIZE; ++j) { r.y = static_cast<scalar_t>(x.y);
r.val[j] = static_cast<scalar_t>(x.val[j]); r.z = static_cast<scalar_t>(x.z);
} r.w = static_cast<scalar_t>(x.w);
vec_residual[i] = r; vec_residual[i] = r;
} }
q8x4_t<scalar_out_t> out; q8x4_t<scalar_out_t> out;
#pragma unroll out.x = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
for (int j = 0; j < VEC_SIZE; ++j) { static_cast<scalar_t>(x.x * rms) * w.x, scale);
out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn( out.y = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale); static_cast<scalar_t>(x.y * rms) * w.y, scale);
} out.z = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.z * rms) * w.z, scale);
out.w = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.w * rms) * w.w, scale);
vec_output[i] = out; vec_output[i] = out;
} }
} }
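
Putting the three stages together, the fused path is equivalent to this scalar reference; int8 output and an inverted scale are assumed for concreteness:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// norm_and_quant, scalar edition: normalize by the precomputed reciprocal
// RMS, apply the weight, scale, then round-and-saturate into int8.
void norm_and_quant_ref(int8_t* out, const float* in, const float* weight,
                        int32_t hidden_size, float rms, float inv_scale) {
  for (int32_t i = 0; i < hidden_size; ++i) {
    float v = in[i] * rms * weight[i] * inv_scale;
    v = std::nearbyint(v);  // round to nearest, ties to even by default
    out[i] = static_cast<int8_t>(std::clamp(v, -128.0f, 127.0f));
  }
}
```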

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob import glob
import itertools import itertools
import os import os

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
import math import math

View File

@@ -10,22 +10,23 @@
namespace vllm { namespace vllm {
// Vectorization containers // Vectorization containers
template <typename scalar_t, size_t vec_size> template <typename scalar_t>
struct __align__(vec_size * sizeof(scalar_t)) vec_n_t { struct __align__(8) vec4_t {
scalar_t val[vec_size]; scalar_t x;
scalar_t y;
scalar_t z;
scalar_t w;
}; };
template <typename quant_type_t, size_t vec_size> template <typename quant_type_t>
struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t { struct __align__(4) q8x4_t {
static_assert(std::is_same_v<quant_type_t, int8_t> || static_assert(std::is_same_v<quant_type_t, int8_t> ||
std::is_same_v<quant_type_t, c10::Float8_e4m3fn> || std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>); std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
quant_type_t val[vec_size]; quant_type_t x;
quant_type_t y;
quant_type_t z;
quant_type_t w;
}; };
template <typename scalar_t>
using vec4_t = vec_n_t<scalar_t, 4>;
template <typename quant_type_t>
using q8x4_t = q8_n_t<quant_type_t, 4>;
} // namespace vllm } // namespace vllm
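
The array-based container on the left-hand side of this hunk is what lets the quantization kernels above collapse their four per-lane statements into `#pragma unroll` loops; a usage sketch:

```cpp
// Sum of squares over one generic vector; with vec_size = 4 this unrolls
// to the same code as the hand-written .x/.y/.z/.w version it replaces.
template <typename scalar_t, size_t vec_size>
__device__ float sum_squares(const vllm::vec_n_t<scalar_t, vec_size>& v) {
  float ss = 0.0f;
#pragma unroll
  for (size_t j = 0; j < vec_size; ++j) {
    float x = static_cast<float>(v.val[j]);
    ss += x * x;
  }
  return ss;
}
```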

View File

@@ -13,34 +13,14 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "quantization/fp8/common.cuh" #include "quantization/fp8/common.cuh"
#if defined(__HIPCC__) && \ #if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__))
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) #define __HIP__MI300_MI250__
#define __HIP__GFX9__
#endif #endif
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__)) #if defined(__HIPCC__) && defined(__gfx942__)
#define __HIP__MI3XX__ #define __HIP__MI300__
#endif #endif
#if defined(__gfx950__)
#define LDS_SIZE (160 * 1024)
#else
#define LDS_SIZE (64 * 1024)
#endif
int get_lds_size() {
static bool is_cached = false;
static int result;
if (is_cached == false) {
auto dprops = at::cuda::getCurrentDeviceProperties();
std::string device_arch = dprops->gcnArchName;
size_t substring = device_arch.find("gfx95");
result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
is_cached = true;
}
return result;
}
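
The split between the compile-time `LDS_SIZE` (which sizes the `__shared__` arrays below) and this cached runtime query matters for fat binaries: host dispatch code cannot see `__gfx950__`, so it asks the device instead. A hedged host-side sketch of how the two meet (`choose_variant` is a hypothetical name, not a function in this file):

```cpp
// Pick the kernel flavor from the runtime LDS budget. The divide by two
// counts LDS in 2-byte fp16/bf16 elements, matching the WVSPLITK macro.
void choose_variant(int K_in, int N_in) {
  const int max_lds_len = get_lds_size() / 2;
  if (K_in * N_in <= max_lds_len) {
    // wvSplitK_hf_sml_: all of A fits in LDS
  } else if (K_in * N_in <= max_lds_len * 1.2) {
    // wvSplitK_hf_: A marginally exceeds LDS
  } else {
    // wvSplitK_hf_big_: A streamed through LDS in kFit-sized chunks
  }
}
```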
#if defined(NDEBUG) #if defined(NDEBUG)
#undef NDEBUG #undef NDEBUG
#include <assert.h> #include <assert.h>
@@ -287,7 +267,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
V0 += (s.x + s.y); \ V0 += (s.x + s.y); \
} }
#if defined(__HIP__GFX9__) // TODO: Add NAVI support #if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
// This version targets cases where A[] fits LDS capacity // This version targets cases where A[] fits LDS capacity
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK, template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N> int UNRL, int N>
@@ -295,8 +275,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C, const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2; #if defined(__HIP__MI300__)
#if defined(__HIP__MI3XX__)
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>); constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
#else #else
constexpr bool use_mfma = false; constexpr bool use_mfma = false;
@@ -316,13 +295,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}; };
//---------------------------------------------------- //----------------------------------------------------
// Reserving 64/160 KB of LDS to have 1 WG / CU // Reserving 64 KB of LDS to have 1 WG / CU
// Goal is to bring the activation matrix A to the LDS // Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group // and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB // TODO: When activation matrix is larger than 64 KB
// then this is not going to work! // then this is not going to work!
//---------------------------------------------------- //----------------------------------------------------
__shared__ scalar_t s[max_lds_len]; __shared__ scalar_t s[1024 * 32];
//---------------------------------------------------- //----------------------------------------------------
// Fetch the activation matrix to LDS // Fetch the activation matrix to LDS
@@ -333,11 +312,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// - Then the WG will move to another 8 K elements // - Then the WG will move to another 8 K elements
// TODO: Logic below will only work when K is multiple of 8 // TODO: Logic below will only work when K is multiple of 8
//---------------------------------------------------- //----------------------------------------------------
for (uint32_t k = 0; k < min(K * N, max_lds_len); for (uint32_t k = 0; k < min(K * N, 32 * 1024);
k += THRDS * WvPrGrp * A_CHUNK) { k += THRDS * WvPrGrp * A_CHUNK) {
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
if (k_in >= min(K * N, max_lds_len)) break; if (k_in >= min(K * N, 32 * 1024)) break;
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
} }
@@ -538,7 +517,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE; m += CuCount * _WvPrGrp * YTILE;
} }
} }
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support #else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK, template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N> int UNRL, int N>
__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
@@ -546,9 +525,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE UNREACHABLE_CODE
} }
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
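
The fetch loop shared by all three variants is a plain cooperative strided copy; isolated below for readability (`prefetch_to_lds` is a hypothetical helper, and the `bigType` vectorized copy is replaced by a scalar loop):

```cpp
// Cooperative strided copy: each thread owns disjoint A_CHUNK-wide slices
// and the workgroup strides through A until the budget (len) is covered.
// Like the kernels above, this assumes len is a multiple of A_CHUNK.
template <typename scalar_t, int THRDS, int WvPrGrp, int A_CHUNK>
__device__ void prefetch_to_lds(scalar_t* s, const scalar_t* __restrict__ A,
                                uint32_t len) {
  for (uint32_t k = 0; k < len; k += THRDS * WvPrGrp * A_CHUNK) {
    uint32_t k_in = k + (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
    if (k_in >= len) break;
#pragma unroll
    for (int j = 0; j < A_CHUNK; ++j) s[k_in + j] = A[k_in + j];
  }
  __syncthreads();  // all loads must land before anyone reads s[]
}
```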
#if defined(__HIP__GFX9__) // TODO: Add NAVI support #if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
// This version targets cases where A[] marginally exceeds LDS capacity // This version targets cases where A[] marginally exceeds LDS capacity
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK, template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N> int UNRL, int N>
@@ -556,8 +535,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_(const int K, const int M, const scalar_t* B, wvSplitK_hf_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C, const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2; #if defined(__HIP__MI300__)
#if defined(__HIP__MI3XX__)
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>); constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
#else #else
constexpr bool use_mfma = false; constexpr bool use_mfma = false;
@@ -583,7 +561,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// TODO: When activation matrix is larger than 64 KB // TODO: When activation matrix is larger than 64 KB
// then this is not going to work! // then this is not going to work!
//---------------------------------------------------- //----------------------------------------------------
__shared__ scalar_t s[max_lds_len]; __shared__ scalar_t s[1024 * 32];
//---------------------------------------------------- //----------------------------------------------------
// Computation of columns that need to be committed to memory! // Computation of columns that need to be committed to memory!
@@ -620,11 +598,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// - Then the WG will move to another 8 K elements // - Then the WG will move to another 8 K elements
// TODO: Logic below will only work when K is multiple of 8 // TODO: Logic below will only work when K is multiple of 8
//---------------------------------------------------- //----------------------------------------------------
for (uint32_t k = 0; k < min(K * N, max_lds_len); for (uint32_t k = 0; k < min(K * N, 32 * 1024);
k += THRDS * WvPrGrp * A_CHUNK) { k += THRDS * WvPrGrp * A_CHUNK) {
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
if (k_in >= min(K * N, max_lds_len)) break; if (k_in >= min(K * N, 32 * 1024)) break;
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
} }
@@ -708,7 +686,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Fetch A activation matrix in interleaved fashion from LDS or memory // Fetch A activation matrix in interleaved fashion from LDS or memory
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
if (k_ + K * n < max_lds_len) if (k_ + K * n < 32 * 1024)
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
else else
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
@@ -839,7 +817,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
} }
} }
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support #else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK, template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N> int UNRL, int N>
__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
@@ -847,9 +825,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE UNREACHABLE_CODE
} }
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
#if defined(__HIP__GFX9__) // TODO: Add NAVI support #if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support
// This version targets big A[] cases, where it is much larger than LDS capacity // This version targets big A[] cases, where it is much larger than LDS capacity
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK, template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N> int UNRL, int N>
@@ -857,8 +835,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C, const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2; #if defined(__HIP__MI300__)
#if defined(__HIP__MI3XX__)
constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>); constexpr bool use_mfma = (std::is_same_v<scalar_t, __hip_bfloat16>);
#else #else
constexpr bool use_mfma = false; constexpr bool use_mfma = false;
@@ -878,13 +855,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}; };
//---------------------------------------------------- //----------------------------------------------------
// Reserving 64/160 KB of LDS to have 1 WG / CU // Reserving 64 KB of LDS to have 1 WG / CU
// Goal is to bring the activation matrix A to the LDS // Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group // and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB // TODO: When activation matrix is larger than 64 KB
// then this is not going to work! // then this is not going to work!
//---------------------------------------------------- //----------------------------------------------------
__shared__ scalar_t s[max_lds_len]; __shared__ scalar_t s[1024 * 32];
//---------------------------------------------------- //----------------------------------------------------
// Computation of columns that need to be committed to memory! // Computation of columns that need to be committed to memory!
@@ -925,11 +902,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
//---------------------------------------------------- //----------------------------------------------------
#define PCML #define PCML
#ifndef PCML #ifndef PCML
for (uint32_t k = 0; k < min(K * N, max_lds_len); for (uint32_t k = 0; k < min(K * N, 32 * 1024);
k += THRDS * WvPrGrp * A_CHUNK) { k += THRDS * WvPrGrp * A_CHUNK) {
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
if (k_in >= min(K * N, max_lds_len)) break; if (k_in >= min(K * N, 32 * 1024)) break;
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
} }
@@ -939,7 +916,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#define TUC (THRDS * UNRL * A_CHUNK) #define TUC (THRDS * UNRL * A_CHUNK)
uint32_t kBase = 0; uint32_t kBase = 0;
// find biggest k size that fits in LDS // find biggest k size that fits in LDS
uint32_t kFit = (max_lds_len) / N; uint32_t kFit = (32 * 1024) / N;
// kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple // kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple
// of TUC // of TUC
kFit = (kFit % TUC == 0) kFit = (kFit % TUC == 0)
@@ -1187,7 +1164,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
} }
} }
} }
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support #else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK, template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N> int UNRL, int N>
__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
@@ -1195,7 +1172,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE UNREACHABLE_CODE
} }
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
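
The big-A variant keeps only `kFit` K-elements per row resident in LDS, aligned to `TUC = THRDS * UNRL * A_CHUNK`; the rounding expression is truncated in the hunk above, so the helper below shows a round-up form as an assumption:

```cpp
// Align kFit to a multiple of m = THRDS * UNRL * A_CHUNK so every unrolled
// fetch of the streamed tile lands entirely inside the resident window.
constexpr uint32_t round_up_to_multiple(uint32_t x, uint32_t m) {
  return (x % m == 0) ? x : x - (x % m) + m;
}
// e.g. round_up_to_multiple(10922, 1024) == 11264
```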
int mindiv(int N, int div1, int div2) { int mindiv(int N, int div1, int div2) {
int nPrRnd = div1 * div2; int nPrRnd = div1 * div2;
@@ -1245,18 +1222,17 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size() / 2;
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ #define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
_N) \ _N) \
{ \ { \
dim3 block(64, _WvPrGrp); \ dim3 block(64, _WvPrGrp); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \ wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ <<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
CuCount); \ CuCount); \
} else if (K_in * N_in <= max_lds_len * 1.2) { \ } else if (K_in * N_in <= 32 * 1024 * 1.2) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \ wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ <<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
@@ -1296,7 +1272,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
return out_c; return out_c;
} }
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support #if defined(__HIP__MI300__) // TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp, template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N> int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS) __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1305,7 +1281,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
const float* __restrict__ s_A, const float* __restrict__ s_A,
const float* __restrict__ s_B, const int _WvPrGrp, const float* __restrict__ s_B, const int _WvPrGrp,
const int CuCount) { const int CuCount) {
constexpr int max_lds_len = LDS_SIZE;
using scalar8 = using scalar8 =
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
@@ -1321,10 +1296,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
scalar8 h8; scalar8 h8;
}; };
__shared__ fp8_t s[max_lds_len]; __shared__ fp8_t s[1024 * 64];
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
*((bigType*)(&s[k])) = *((bigType*)(&A[k])); *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
} }
__syncthreads(); __syncthreads();
@@ -1461,7 +1436,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE; m += CuCount * _WvPrGrp * YTILE;
} }
} }
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support #else // !defined(__HIP__MI300__) TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp, template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N> int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
@@ -1471,9 +1446,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE UNREACHABLE_CODE
} }
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support #endif // defined(__HIP__MI300__) TODO: Add NAVI support
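
The FP8 variants carry per-tensor scales `s_A` and `s_B` alongside the quantized data; because both scales are per-tensor, they can be folded in once per output element after accumulation. A sketch under that assumption:

```cpp
// Dequantization is deferred: accumulate the raw fp8 products in float,
// then rescale each output element once, instead of dequantizing every
// input element before the multiply.
__device__ float finalize_fp8_acc(float raw_acc, float s_A, float s_B) {
  return raw_acc * s_A * s_B;
}
```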
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support #if defined(__HIP__MI300__) // TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp, template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N> int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS) __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1481,7 +1456,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
const fp8_t* __restrict__ A, scalar_t* C, const fp8_t* __restrict__ A, scalar_t* C,
const float* __restrict__ s_A, const float* __restrict__ s_B, const float* __restrict__ s_A, const float* __restrict__ s_B,
const int _WvPrGrp, const int CuCount) { const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE;
using scalar8 = using scalar8 =
__attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
@@ -1497,10 +1471,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
scalar8 h8; scalar8 h8;
}; };
__shared__ fp8_t s[max_lds_len]; __shared__ fp8_t s[1024 * 64];
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) {
*((bigType*)(&s[k])) = *((bigType*)(&A[k])); *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
} }
__syncthreads(); __syncthreads();
@@ -1543,7 +1517,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t k_ = k + threadIdx.x * A_CHUNK; uint32_t k_ = k + threadIdx.x * A_CHUNK;
if (k_ >= K) break; if (k_ >= K) break;
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
if (k_ + K * n < max_lds_len) if (k_ + K * n < 64 * 1024)
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
else else
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
@@ -1634,7 +1608,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE; m += CuCount * _WvPrGrp * YTILE;
} }
} }
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support #else // !defined(__HIP__MI300__) TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp, template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N> int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
@@ -1644,7 +1618,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
const int CuCount) { const int CuCount) {
UNREACHABLE_CODE UNREACHABLE_CODE
} }
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support #endif // defined(__HIP__MI300__) TODO: Add NAVI support
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
at::Tensor& scale_a, at::Tensor& scale_b, at::Tensor& scale_a, at::Tensor& scale_b,
@@ -1664,13 +1638,12 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
dim3 grid(CuCount); dim3 grid(CuCount);
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size();
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ #define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
_N) \ _N) \
{ \ { \
dim3 block(64, _WvPrGrp); \ dim3 block(64, _WvPrGrp); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \ wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ <<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \

View File

@@ -1,86 +0,0 @@
#include "dispatch_utils.h"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
#ifndef USE_ROCM
#include <cub/cub.cuh>
#else
#include <hipcub/hipcub.hpp>
#endif
namespace vllm {
template <typename scalar_t>
__global__ void apply_repetition_penalties_kernel(
scalar_t* __restrict__ logits, // [num_seqs, vocab_size]
const bool* __restrict__ prompt_mask, // [num_seqs, vocab_size]
const bool* __restrict__ output_mask, // [num_seqs, vocab_size]
const scalar_t* __restrict__ repetition_penalties, // [num_seqs]
const int num_seqs, const int vocab_size, const int tile_size) {
// Each block handles one sequence and a tile of vocab
const int seq_idx = blockIdx.x;
if (seq_idx >= num_seqs) return;
const int tile_start = blockIdx.y * tile_size;
const int tile_end = min(tile_start + tile_size, vocab_size);
// Load repetition penalty for this sequence
const scalar_t penalty = repetition_penalties[seq_idx];
// Each thread processes multiple vocab items within the tile
for (int vocab_idx = tile_start + threadIdx.x; vocab_idx < tile_end;
vocab_idx += blockDim.x) {
const int64_t idx = static_cast<int64_t>(seq_idx) * vocab_size + vocab_idx;
const bool is_repeated = prompt_mask[idx] || output_mask[idx];
if (is_repeated) {
scalar_t logit = logits[idx];
if (logit > 0) {
logits[idx] = logit / penalty;
} else {
logits[idx] = logit * penalty;
}
}
}
}
} // namespace vllm
void apply_repetition_penalties_(
torch::Tensor& logits, // [num_seqs, vocab_size], in-place
const torch::Tensor& prompt_mask, // [num_seqs, vocab_size]
const torch::Tensor& output_mask, // [num_seqs, vocab_size]
const torch::Tensor& repetition_penalties) { // [num_seqs]
TORCH_CHECK(logits.is_contiguous());
TORCH_CHECK(prompt_mask.is_contiguous());
TORCH_CHECK(output_mask.is_contiguous());
TORCH_CHECK(repetition_penalties.is_contiguous());
int vocab_size = logits.size(-1);
int num_seqs = logits.size(0);
// Get number of SMs on the current device
int sms = 0;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
logits.get_device());
// Compute tile_num and tile_size
int tile_num =
std::min(vocab_size, std::max(1, (sms + num_seqs - 1) / num_seqs));
int tile_size = (vocab_size + tile_num - 1) / tile_num;
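// Example: with a 50k vocab, 132 SMs and 8 sequences, tile_num =
// min(50000, ceil(132 / 8)) = 17, giving an 8 x 17 grid of 136 blocks
// (roughly one per SM); tile_size then splits the vocab across 17 tiles.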
// Each block handles one sequence and a tile of vocab
dim3 grid(num_seqs, tile_num);
dim3 block(std::min(tile_size, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(logits));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
logits.scalar_type(), "apply_repetition_penalties_kernel", [&] {
vllm::apply_repetition_penalties_kernel<scalar_t>
<<<grid, block, 0, stream>>>(
logits.data_ptr<scalar_t>(), prompt_mask.data_ptr<bool>(),
output_mask.data_ptr<bool>(),
repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
tile_size);
});
}
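
For reference, the rule the deleted kernel applies per vocabulary entry is the HF-style repetition penalty; a scalar restatement:

```cpp
// A token seen in the prompt or in the output so far has its logit
// attenuated: divided by the penalty when positive, multiplied when
// negative, so any penalty > 1 pushes the token away from selection.
float apply_penalty_ref(float logit, bool in_prompt, bool in_output,
                        float penalty) {
  if (!(in_prompt || in_output)) return logit;
  return logit > 0.0f ? logit / penalty : logit * penalty;
}
```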

Some files were not shown because too many files have changed in this diff.