Compare commits


11 Commits

Author SHA1 Message Date
Shengqi Chen
b17039bccc [CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032)
(cherry picked from commit 8e61425ee6)
2026-01-16 21:04:48 -08:00
Cyrus Leung
48b67ba75f [Frontend] Standardize use of create_error_response (#32319)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-16 11:35:10 +00:00
TJian
09f4264a55 [Bugfix] Fix ROCm dockerfiles (#32447)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-16 10:50:00 +08:00
Matthew Bonanni
7f42dc20bb [CI] Fix LM Eval Large Models (H100) (#32423)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit bcf2333cd6)
2026-01-15 18:00:21 -08:00
TJian
c2a37a3cf8 Cherry pick [ROCm] [CI] [Release] Rocm wheel pipeline with sccache #32264
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-01-15 17:59:58 -08:00
Michael Goin
0e31fc7996 [UX] Use kv_offloading_backend=native by default (#32421)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit 1be5a73571)
2026-01-15 17:55:20 -08:00
Pleaplusone
6ac0fcf416 [ROCm][Bugfix] Disable hip sampler to fix deepseek's accuracy issue on ROCm (#32413)
Signed-off-by: ganyi <ygan@amd.com>
(cherry picked from commit 77c16df31d)
2026-01-15 17:55:06 -08:00
Douglas Lehr
b62249725c [ROCM] Add ROCm image build to release pipeline (#31995)
Signed-off-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
(cherry picked from commit c5891b5430)
2026-01-15 17:54:47 -08:00
vllmellm
1b57275207 [Bugfix][ROCm][performance] Resolve the performance regression issue of the Qwen3-Next-80B-A3B-Thinking under rocm_atten (#32336)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
(cherry picked from commit e27078ea80)
2026-01-15 17:54:01 -08:00
Martin Hickey
2c24bc6996 [BugFix] [KVConnector] Fix KV events for LMCache connector (#32169)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-13 10:56:23 -08:00
Cyrus Leung
0aa8c40552 [Bugfix] Replace PoolingParams.normalize with use_activation (#32243)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-13 10:56:23 -08:00
1268 changed files with 35452 additions and 71986 deletions

View File

@@ -1,8 +1,7 @@
 name: vllm_ci
 job_dirs:
-- ".buildkite/image_build"
 - ".buildkite/test_areas"
-- ".buildkite/hardware_tests"
+- ".buildkite/image_build"
 run_all_patterns:
 - "docker/Dockerfile"
 - "CMakeLists.txt"

View File

@@ -1,29 +0,0 @@
group: Hardware
steps:
- label: "AMD: :docker: build image"
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1

View File

@@ -1,8 +0,0 @@
group: Hardware
steps:
- label: "Arm CPU Test"
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -1,10 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Ascend NPU Test"
soft_fail: true
timeout_in_minutes: 20
no_plugin: true
device: ascend_npu
commands:
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh

View File

@@ -1,10 +0,0 @@
group: Hardware
steps:
- label: "GH200 Test"
soft_fail: true
device: gh200
no_plugin: true
optional: true
commands:
- nvidia-smi
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh

View File

@@ -1,24 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Intel CPU Test"
soft_fail: true
device: intel_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
- label: "Intel HPU Test"
soft_fail: true
device: intel_hpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- label: "Intel GPU Test"
depends_on: []
soft_fail: true
device: intel_gpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh

View File

@@ -1,256 +1,56 @@
#!/bin/bash #!/bin/bash
set -euo pipefail set -e
# replace invalid characters in Docker image tags and truncate to 128 chars if [[ $# -lt 8 ]]; then
clean_docker_tag() { echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
local input="$1" exit 1
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
}
print_instance_info() {
echo ""
echo "=== Debug: Instance Information ==="
# Get IMDSv2 token
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
echo "AMI ID: ${AMI_ID}"
echo "Instance Type: ${INSTANCE_TYPE}"
echo "Instance ID: ${INSTANCE_ID}"
echo "AZ: ${AZ}"
else
echo "Not running on EC2 or IMDS not available"
fi
# Check for warm cache AMI (marker file baked into custom AMI)
if [[ -f /etc/vllm-ami-info ]]; then
echo "Cache: warm (custom vLLM AMI)"
cat /etc/vllm-ami-info
else
echo "Cache: cold (standard AMI)"
fi
echo "==================================="
echo ""
}
setup_buildx_builder() {
echo "--- :buildkite: Setting up buildx builder"
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
# Custom AMI with standalone buildkitd - use remote driver for warm cache
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
echo "Using remote driver to connect to buildkitd (warm cache available)"
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
echo "Using existing baked-vllm-builder"
docker buildx use baked-vllm-builder
else
echo "Creating baked-vllm-builder with remote driver"
docker buildx create \
--name baked-vllm-builder \
--driver remote \
--use \
"unix://${BUILDKIT_SOCKET}"
fi
docker buildx inspect --bootstrap
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
# Existing builder available
echo "Using existing builder: ${BUILDER_NAME}"
docker buildx use "${BUILDER_NAME}"
docker buildx inspect --bootstrap
else
# No local buildkitd, no existing builder - create new docker-container builder
echo "No local buildkitd found, using docker-container driver"
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
docker buildx inspect --bootstrap
fi
# builder info
echo "Active builder:"
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
check_and_skip_if_image_exists() {
if [[ -n "${IMAGE_TAG:-}" ]]; then
echo "--- :mag: Checking if image exists"
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
echo "Image already exists: ${IMAGE_TAG}"
echo "Skipping build"
exit 0
fi
echo "Image not found, proceeding with build"
fi
}
ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}
prepare_cache_tags() {
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
cache="${MAIN_CACHE_ECR}:latest"
else
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
cache="${TEST_CACHE_ECR}:${clean_branch}"
fi
CACHE_TO="$cache"
CACHE_FROM="$cache"
CACHE_FROM_BASE_BRANCH="$cache"
else
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
else
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
fi
fi
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
resolve_parent_commit() {
if [[ -z "${PARENT_COMMIT:-}" ]]; then
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
if [[ -n "${PARENT_COMMIT}" ]]; then
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
export PARENT_COMMIT
else
echo "Could not determine parent commit (may be first commit in repo)"
fi
else
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
fi
}
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}
#################################
# Main Script #
#################################
print_instance_info
if [[ $# -lt 7 ]]; then
print_usage_and_exit
fi fi
# input args
REGISTRY=$1 REGISTRY=$1
REPO=$2 REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
BRANCH=$4 BRANCH=$4
VLLM_USE_PRECOMPILED=$5 VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6 VLLM_MERGE_BASE_COMMIT=$6
IMAGE_TAG=$7 CACHE_FROM=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional CACHE_TO=$8
# build config # authenticate with AWS ECR
TARGET="test-ci" aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}" aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
prepare_cache_tags # docker buildx
ecr_login docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# Environment info (for docs and human readers) # skip build if image already exists
# VLLM_CI_BRANCH - ci-infra branch to use (default: main) if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl) echo "Image not found, proceeding with build..."
# BUILDER_NAME - Name for buildx builder (default: vllm-builder) else
# echo "Image found"
# Build configuration (exported as environment variables for bake): exit 0
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
check_and_skip_if_image_exists
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
echo "Make sure you're running from the vLLM repository root"
exit 1
fi fi
echo "--- :arrow_down: Downloading ci.hcl" if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}" merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
echo "Downloaded to ${CI_HCL_PATH}" else
merge_base_commit_build_args=""
if [[ ! -f "${CI_HCL_PATH}" ]]; then
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
exit 1
fi fi
setup_buildx_builder # build
docker buildx build --file docker/Dockerfile \
resolve_parent_commit --build-arg max_jobs=16 \
export PARENT_COMMIT --build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
print_bake_config --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
echo "--- :docker: Building ${TARGET}" --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}" ${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
echo "--- :white_check_mark: Build complete" --cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .
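
Both the old and the new build script above skip the build when the commit-tagged image is already in the registry (check_and_skip_if_image_exists in the old script, the docker manifest inspect guard in the new one). A minimal standalone sketch of that guard, with a placeholder image name:

#!/bin/bash
# Skip the expensive build when the commit-tagged image is already published.
IMAGE="public.ecr.aws/example/vllm-ci:abc1234"   # placeholder tag
if docker manifest inspect "$IMAGE" >/dev/null 2>&1; then
  echo "Image already exists: $IMAGE, skipping build"
  exit 0
fi
echo "Image not found, proceeding with build"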
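
The clean_docker_tag helper in the longer script exists because Docker image tags may only contain letters, digits, underscores, periods, and dashes, and are capped at 128 characters, so branch names such as feature/foo have to be normalized before they become cache tags. A usage sketch (the input string is made up):

#!/bin/bash
# Same helper as in the script above; replaces invalid tag characters and truncates to 128 chars.
clean_docker_tag() {
  local input="$1"
  echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
clean_docker_tag "feature/rocm wheels#1"   # prints: feature_rocm_wheels_1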

View File

@@ -4,8 +4,7 @@ steps:
 key: image-build
 depends_on: []
 commands:
-- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
-- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
+- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
 retry:
 automatic:
 - exit_status: -1 # Agent was lost
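
The new invocation forwards CACHE_FROM and CACHE_TO into BuildKit's registry cache. A hedged sketch of the shape those arguments take inside image_build.sh; the ECR path and PR number below are illustrative:

#!/bin/bash
# Illustrative only: the --cache-from/--cache-to shape used by the build script.
CACHE_FROM="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache:pr-12345"
CACHE_TO="${CACHE_FROM}"
docker buildx build \
  --file docker/Dockerfile \
  --cache-from "type=registry,ref=${CACHE_FROM}" \
  --cache-to "type=registry,ref=${CACHE_TO},mode=max" \
  --tag "example/vllm-ci:local" \
  --target test \
  .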

View File

@@ -1,5 +0,0 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml

View File

@@ -393,7 +393,7 @@ if __name__ == "__main__":
 with open(results_folder / md_file, "w") as f:
 results = read_markdown(
 "../.buildkite/performance-benchmarks/"
-"performance-benchmarks-descriptions.md"
++ "performance-benchmarks-descriptions.md"
 )
 results = results.format(
 latency_tests_markdown_table=latency_md_table,

View File

@@ -25,9 +25,9 @@ check_gpus() {
 echo "Need at least 1 GPU to run benchmarking."
 exit 1
 fi
 declare -g arch_suffix=''
 if command -v nvidia-smi; then
 declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
 elif command -v amd-smi; then
@@ -181,20 +181,19 @@ upload_to_buildkite() {
 $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
-run_benchmark_tests() {
-# run benchmark tests using `vllm bench <test_type>` command
-# $1: test type (latency or throughput)
-# $2: a json file specifying test cases
-local test_type=$1
-local test_file=$2
-# Iterate over tests
-jq -c '.[]' "$test_file" | while read -r params; do
+run_latency_tests() {
+# run latency tests using `vllm bench latency` command
+# $1: a json file specifying latency test cases
+local latency_test_file
+latency_test_file=$1
+# Iterate over latency tests
+jq -c '.[]' "$latency_test_file" | while read -r params; do
 # get the test name, and append the GPU type back to it.
 test_name=$(echo "$params" | jq -r '.test_name')
-if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
-echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
+if [[ ! "$test_name" =~ ^latency_ ]]; then
+echo "In latency-test.json, test_name must start with \"latency_\"."
 exit 1
 fi
@@ -205,15 +204,15 @@ run_benchmark_tests() {
 fi
 # get arguments
-bench_params=$(echo "$params" | jq -r '.parameters')
-bench_args=$(json2args "$bench_params")
-bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-bench_envs=$(json2envs "$bench_environment_variables")
+latency_params=$(echo "$params" | jq -r '.parameters')
+latency_args=$(json2args "$latency_params")
+latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+latency_envs=$(json2envs "$latency_environment_variables")
 # check if there is enough GPU to run the test
-tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
+tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
 if [[ "$ON_CPU" == "1" ]]; then
-pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
+pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
 world_size=$(($tp*$pp))
 if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
 echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -226,42 +225,97 @@ run_benchmark_tests() {
 fi
 fi
-bench_command=" $bench_envs vllm bench $test_type \
+latency_command=" $latency_envs vllm bench latency \
 --output-json $RESULTS_FOLDER/${test_name}.json \
-$bench_args"
+$latency_args"
 echo "Running test case $test_name"
-echo "${test_type^} command: $bench_command"
-# recording benchmarking command and GPU command
+echo "Latency command: $latency_command"
+# recoding benchmarking command ang GPU command
 jq_output=$(jq -n \
---arg command "$bench_command" \
+--arg latency "$latency_command" \
 --arg gpu "$gpu_type" \
---arg test_type "$test_type" \
 '{
-($test_type + "_command"): $command,
+latency_command: $latency,
 gpu_type: $gpu
 }')
 echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 # run the benchmark
-eval "$bench_command"
+eval "$latency_command"
 kill_gpu_processes
 done
 }
-run_latency_tests() {
-run_benchmark_tests "latency" "$1"
-}
-run_startup_tests() {
-run_benchmark_tests "startup" "$1"
-}
 run_throughput_tests() {
-run_benchmark_tests "throughput" "$1"
+# run throughput tests using `vllm bench throughput`
+# $1: a json file specifying throughput test cases
+local throughput_test_file
+throughput_test_file=$1
+# Iterate over throughput tests
+jq -c '.[]' "$throughput_test_file" | while read -r params; do
+# get the test name, and append the GPU type back to it.
+test_name=$(echo "$params" | jq -r '.test_name')
+if [[ ! "$test_name" =~ ^throughput_ ]]; then
+echo "In throughput-test.json, test_name must start with \"throughput_\"."
+exit 1
+fi
+# if TEST_SELECTOR is set, only run the test cases that match the selector
+if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+echo "Skip test case $test_name."
+continue
+fi
+# get arguments
+throughput_params=$(echo "$params" | jq -r '.parameters')
+throughput_args=$(json2args "$throughput_params")
+throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+throughput_envs=$(json2envs "$throughput_environment_variables")
+# check if there is enough GPU to run the test
+tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+if [[ "$ON_CPU" == "1" ]]; then
+pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
+world_size=$(($tp*$pp))
+if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+continue
+fi
+else
+if [[ $gpu_count -lt $tp ]]; then
+echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+continue
+fi
+fi
+throughput_command=" $throughput_envs vllm bench throughput \
+--output-json $RESULTS_FOLDER/${test_name}.json \
+$throughput_args"
+echo "Running test case $test_name"
+echo "Throughput command: $throughput_command"
+# recoding benchmarking command ang GPU command
+jq_output=$(jq -n \
+--arg command "$throughput_command" \
+--arg gpu "$gpu_type" \
+'{
+throughput_command: $command,
+gpu_type: $gpu
+}')
+echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+# run the benchmark
+eval "$throughput_command"
+kill_gpu_processes
+done
 }
 run_serving_tests() {
@@ -480,7 +534,6 @@ main() {
 # benchmarking
 run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
 run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
-run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
 run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
 # postprocess benchmarking results
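
The benchmark runner above is driven by JSON test-case files: each entry carries a test_name prefixed with the test type, optional environment_variables, and a parameters object that json2args turns into CLI flags for vllm bench. A hedged sketch of that expansion; the file name, model, and the exact json2args behavior are assumptions for illustration:

#!/bin/bash
# Sketch of expanding one test-case entry into a command line (values are made up).
cat > /tmp/latency-tests.json << 'EOF'
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {},
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "num_iters": 5
    }
  }
]
EOF
params=$(jq -c '.[0].parameters' /tmp/latency-tests.json)
# json2args-style conversion (assumed): {"tensor_parallel_size": 1} -> "--tensor-parallel-size 1"
args=$(echo "$params" | jq -r 'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")')
echo vllm bench latency $args --output-json /tmp/latency_llama8B_tp1.json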

View File

@@ -1,270 +1,277 @@
steps: steps:
# aarch64 + CUDA builds
- label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build wheel - aarch64 - CPU"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~
id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~
id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 CPU wheel build
- label: "Build wheel - x86_64 - CPU"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# Build release images (CUDA 12.9)
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- block: "Build CUDA 13.0 release images"
key: block-release-image-build-cuda-13-0
depends_on: ~
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Build release image - aarch64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- input: "Provide Release version here" - input: "Provide Release version here"
id: input-release-version id: input-release-version
fields: fields:
- text: "What is the release version?" - text: "What is the release version?"
key: release-version key: release-version
- group: "Build Python wheels" - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: "build-wheels" key: block-upload-release-wheels
steps: depends_on:
- label: "Build wheel - aarch64 - CUDA 12.9" - input-release-version
depends_on: ~ - build-wheel-x86-cuda-12-9
id: build-wheel-arm64-cuda-12-9 - build-wheel-x86-cuda-13-0
agents: - build-wheel-x86-cpu
queue: arm64_cpu_queue_postmerge - build-wheel-arm64-cuda-12-9
commands: - build-wheel-arm64-cuda-13-0
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: - build-wheel-arm64-cpu
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CUDA 13.0" - label: "Upload release wheels to PyPI and GitHub"
depends_on: ~ depends_on:
id: build-wheel-arm64-cuda-13-0 - block-upload-release-wheels
agents: id: upload-release-wheels
queue: arm64_cpu_queue_postmerge agents:
commands: queue: small_cpu_queue_postmerge
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: commands:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - "bash .buildkite/scripts/upload-release-wheels.sh"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CPU" - block: "Build CPU release image"
depends_on: ~ key: block-cpu-release-image-build
id: build-wheel-arm64-cpu depends_on: ~
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 12.9" - label: "Build and publish CPU release image"
depends_on: ~ depends_on: block-cpu-release-image-build
id: build-wheel-x86-cuda-12-9 agents:
agents: queue: cpu_queue_postmerge
queue: cpu_queue_postmerge commands:
commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31" env:
env: DOCKER_BUILDKIT: "1"
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 13.0" - block: "Build arm64 CPU release image"
depends_on: ~ key: block-arm64-cpu-release-image-build
id: build-wheel-x86-cuda-13-0 depends_on: ~
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CPU" - label: "Build and publish arm64 CPU release image"
depends_on: ~ depends_on: block-arm64-cpu-release-image-build
id: build-wheel-x86-cpu agents:
agents: queue: arm64_cpu_queue_postmerge
queue: cpu_queue_postmerge commands:
commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" env:
env: DOCKER_BUILDKIT: "1"
DOCKER_BUILDKIT: "1"
- group: "Build release Docker images" - block: "Build ROCm release image"
key: "build-release-images" key: block-rocm-release-image-build
steps: depends_on: ~
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image - aarch64 - CUDA 12.9" - label: "Build release image (ROCm)"
depends_on: ~ depends_on: block-rocm-release-image-build
id: build-release-image-arm64 id: build-release-image-rocm
agents: agents:
queue: arm64_cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." # Build base image first
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
# Build vLLM ROCm image using the base
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: ~ - label: "Build and publish nightly multi-arch image to DockerHub"
id: build-release-image-x86-cuda-13-0 depends_on:
agents: - create-multi-arch-manifest
queue: cpu_queue_postmerge if: build.env("NIGHTLY") == "1"
commands: agents:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" queue: small_cpu_queue_postmerge
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." commands:
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# re-tag to default image tag and push, just in case arm64 build fails - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- label: "Build release image - aarch64 - CUDA 13.0" - "docker push vllm/vllm-openai:nightly-x86_64"
depends_on: ~ - "docker push vllm/vllm-openai:nightly-aarch64"
id: build-release-image-arm64-cuda-13-0 - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
agents: - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
queue: arm64_cpu_queue_postmerge - "docker manifest push vllm/vllm-openai:nightly"
commands: - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" # Clean up old nightly builds (keep only last 14)
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." plugins:
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" - docker-login#v3.0.0:
username: vllmbot
- block: "Build release image for x86_64 CPU" password-env: DOCKERHUB_TOKEN
key: block-cpu-release-image-build env:
depends_on: ~ DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Build release image - x86_64 - CPU"
depends_on:
- block-cpu-release-image-build
- input-release-version
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image for arm64 CPU"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build release image - arm64 - CPU"
depends_on:
- block-arm64-cpu-release-image-build
- input-release-version
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- group: "Publish release images"
key: "publish-release-images"
steps:
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
depends_on:
- create-multi-arch-manifest-cuda-13-0
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels"
key: "publish-wheels"
steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
depends_on:
- input-release-version
- build-wheels
- label: "Upload release wheels to PyPI"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
@@ -459,7 +466,7 @@
 S3_BUCKET: "vllm-wheels"
 # ROCm Job 2: Build vLLM ROCm Wheel
-- label: ":python: Build vLLM ROCm Wheel - x86_64"
+- label: ":python: Build vLLM ROCm Wheel"
 id: build-rocm-vllm-wheel
 depends_on:
 - step: build-rocm-base-wheels
@@ -621,93 +628,9 @@ steps:
 depends_on:
 - step: upload-rocm-wheels
 allow_failure: true
-- step: input-release-version
-allow_failure: true
 agents:
 queue: cpu_queue_postmerge
 commands:
 - "bash .buildkite/scripts/annotate-rocm-release.sh"
 env:
 S3_BUCKET: "vllm-wheels"
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
- block: "Generate Root Index for ROCm Wheels for Release"
key: block-generate-root-index-rocm-wheels
depends_on: upload-rocm-wheels
- label: ":package: Generate Root Index for ROCm Wheels for Release"
depends_on: block-generate-root-index-rocm-wheels
id: generate-root-index-rocm-wheels
agents:
queue: cpu_queue_postmerge
commands:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
S3_BUCKET: "vllm-wheels"
VARIANT: "rocm700"
# ROCm Job 5: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
id: build-rocm-release-image
depends_on:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 60
commands:
- |
set -euo pipefail
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Download Docker image from S3 (set by build-rocm-base-wheels)
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
echo "ERROR: rocm-docker-image-s3-path metadata not found"
exit 1
fi
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
mkdir -p artifacts/rocm-docker-image
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
# Load base Docker image
echo "Loading base Docker image..."
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
echo "Loaded base image: $${BASE_IMAGE_TAG}"
# Tag and push the base image to ECR
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
# Get GPU architectures from meta-data
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
# Build vLLM ROCm release image using cached base
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
--target vllm-openai \
--progress plain \
-f docker/Dockerfile.rocm .
# Push to ECR
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
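
The release steps above push one image per architecture and then stitch them together under a single tag with docker manifest, so a plain docker pull resolves to the right architecture. A minimal reproduction of that pattern with placeholder registry and tags:

#!/bin/bash
# Placeholder names; mirrors the manifest create/push steps in the pipeline above.
REPO="public.ecr.aws/example/vllm-release-repo"
COMMIT="abc1234"
docker manifest create "${REPO}:${COMMIT}" \
  "${REPO}:${COMMIT}-x86_64" \
  "${REPO}:${COMMIT}-aarch64" \
  --amend
docker manifest push "${REPO}:${COMMIT}"
# Clients that pull ${REPO}:${COMMIT} now receive the image matching their CPU architecture.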

View File

@@ -11,80 +11,51 @@ fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit): To download the wheel (by commit):
\`\`\` \`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
(Optional) For CUDA 13.0: aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
\`\`\` \`\`\`
To download the wheel (by version):
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image: To download and upload the image:
\`\`\` \`\`\`
Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
Tag and push images:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64 docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130 docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130 docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai:latest-aarch64-cu130 docker push vllm/vllm-openai:latest-rocm
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130 docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker manifest rm vllm/vllm-openai:latest docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
\`\`\` \`\`\`
EOF EOF

View File

@@ -3,32 +3,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# #
# Generate Buildkite annotation for ROCm wheel release # Generate Buildkite annotation for ROCm wheel release
set -ex set -ex
# Get build configuration from meta-data # Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base # Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0" # BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown") ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12") PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# TODO: Enable the nightly build for ROCm
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev"
fi
# S3 URLs # S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}" S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}" S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com" S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases ## :rocm: ROCm Wheel Release
### Build Configuration ### Build Configuration
| Setting | Value | | Setting | Value |
|---------|-------| |---------|-------|
@@ -41,72 +34,41 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
### :package: Installation ### :package: Installation
**Install from this build (by commit):** **Install from this build (by commit):**
\`\`\`bash \`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
# Example for ROCm ${ROCM_VERSION}: # Example:
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\` \`\`\`
**Install from nightly (if published):** **Install from nightly (if published):**
\`\`\`bash \`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
\`\`\` \`\`\`
### :floppy_disk: Download Wheels Directly ### :floppy_disk: Download Wheels Directly
\`\`\`bash \`\`\`bash
# List all ROCm wheels # List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
# Download specific wheels # Download specific wheels
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
\`\`\` \`\`\`
### :gear: Included Packages ### :gear: Included Packages
- **vllm**: vLLM with ROCm support - **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION} - **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton**: Triton - **triton_rocm**: Triton built for ROCm
- **triton-kernels**: Triton kernels
- **torchvision**: TorchVision for ROCm PyTorch - **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings - **amdsmi**: AMD SMI Python bindings
- **aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm
### :warning: Notes ### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs - These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH} - Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only - Platform: Linux x86_64 only
### :package: Docker Image Release
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
\`\`\`
EOF EOF

View File

@@ -1,242 +0,0 @@
#!/bin/bash
#
# cherry-pick-from-milestone.sh
# Find commits from a GitHub milestone that are missing from the current branch
# and output them in chronological order for cherry-picking.
#
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
#
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
usage() {
cat <<EOF
Usage: $(basename "$0") <milestone> [options]
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
Arguments:
milestone The GitHub milestone name (e.g., v0.14.0)
Options:
--dry-run Show the cherry-pick commands without executing (default)
--execute Actually execute the cherry-picks
--main-branch Specify the main branch name (default: main)
--help Show this help message
Examples:
$(basename "$0") v0.14.0
$(basename "$0") v0.14.0 --dry-run
$(basename "$0") v0.14.0 --execute
$(basename "$0") v0.14.0 --main-branch master
EOF
exit 1
}
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[OK]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Default values
MILESTONE=""
DRY_RUN=true
MAIN_BRANCH="main"
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--dry-run)
DRY_RUN=true
shift
;;
--execute)
DRY_RUN=false
shift
;;
--main-branch)
MAIN_BRANCH="$2"
shift 2
;;
--help|-h)
usage
;;
-*)
log_error "Unknown option: $1"
usage
;;
*)
if [[ -z "$MILESTONE" ]]; then
MILESTONE="$1"
else
log_error "Unexpected argument: $1"
usage
fi
shift
;;
esac
done
# Validate milestone argument
if [[ -z "$MILESTONE" ]]; then
log_error "Milestone is required"
usage
fi
# Check if we're in a git repository
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
log_error "Not in a git repository"
exit 1
fi
# Check if gh CLI is available
if ! command -v gh &>/dev/null; then
log_error "GitHub CLI (gh) is not installed"
exit 1
fi
# Check if authenticated with gh
if ! gh auth status &>/dev/null; then
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
exit 1
fi
CURRENT_BRANCH=$(git branch --show-current)
log_info "Current branch: ${CURRENT_BRANCH}"
log_info "Main branch: ${MAIN_BRANCH}"
log_info "Milestone: ${MILESTONE}"
echo ""
# Fetch latest from remote
log_info "Fetching latest from remote..."
git fetch origin "$MAIN_BRANCH" --quiet
# Get merged PRs from the milestone, sorted by merge date
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file
PR_DATA=$(mktemp)
trap "rm -f $PR_DATA" EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \
--json number,title,mergeCommit,mergedAt \
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
log_error "This could be due to:"
log_error " - Milestone does not exist"
log_error " - Network/authentication issues"
log_error " - Invalid milestone name format"
exit 1
fi
if [[ ! -s "$PR_DATA" ]]; then
log_warn "No merged PRs found for milestone '${MILESTONE}'"
exit 0
fi
TOTAL_PRS=$(wc -l < "$PR_DATA")
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
echo ""
# Find commits that are missing from current branch
MISSING_COMMITS=()
MISSING_INFO=()
while IFS=$'\t' read -r sha pr_number title; do
# Skip if SHA is empty or null
if [[ -z "$sha" || "$sha" == "null" ]]; then
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
continue
fi
# Check if this commit is already in the current branch
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
log_success "PR #${pr_number} already in branch: ${title:0:60}"
else
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
MISSING_COMMITS+=("$sha")
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
fi
done < "$PR_DATA"
echo ""
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
exit 0
fi
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
echo ""
# Output the cherry-pick commands
echo "=========================================="
echo "Cherry-pick commands (in chronological order):"
echo "=========================================="
echo ""
for info in "${MISSING_INFO[@]}"; do
echo "# $info"
done
echo ""
echo "# Run these commands to cherry-pick all missing commits:"
echo "git cherry-pick ${MISSING_COMMITS[*]}"
echo ""
# Or one by one
echo "# Or cherry-pick one at a time:"
for sha in "${MISSING_COMMITS[@]}"; do
echo "git cherry-pick $sha"
done
echo ""
# Execute if requested
if [[ "$DRY_RUN" == false ]]; then
echo "=========================================="
log_info "Executing cherry-picks..."
echo "=========================================="
for i in "${!MISSING_COMMITS[@]}"; do
sha="${MISSING_COMMITS[$i]}"
info="${MISSING_INFO[$i]}"
echo ""
log_info "Cherry-picking: $info"
if git cherry-pick "$sha"; then
log_success "Successfully cherry-picked $sha"
else
log_error "Failed to cherry-pick $sha"
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
exit 1
fi
done
echo ""
log_success "All cherry-picks completed successfully!"
else
echo "=========================================="
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
echo "Run with --execute to perform the cherry-picks"
echo "=========================================="
fi
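The core of the script above is the `git merge-base --is-ancestor` test, which decides whether a PR's merge commit is already contained in the current branch. A minimal standalone sketch of that check (the `is-picked.sh` name is hypothetical, not part of the repository):

```bash
#!/bin/bash
# Usage: ./is-picked.sh <merge-commit-sha>
# Mirrors the script's core test: is the commit already reachable from HEAD?
sha="$1"
if git merge-base --is-ancestor "$sha" HEAD; then
  echo "already on this branch - nothing to cherry-pick"
else
  echo "missing - candidate for: git cherry-pick $sha"
fi
```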

View File

@@ -3,14 +3,7 @@
set -ex set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix # This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
# Get tag prefix from argument, default to "nightly-" if not provided
TAG_PREFIX="${1:-nightly-}"
echo "Cleaning up tags with prefix: $TAG_PREFIX"
# DockerHub API endpoint for vllm/vllm-openai repository # DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -52,7 +45,7 @@ get_all_tags() {
set -x set -x
# Get both last_updated timestamp and tag name, separated by | # Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"') local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
if [ -z "$tags" ]; then if [ -z "$tags" ]; then
break break

View File

@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
def generate_project_list(subdir_names: list[str], comment: str = "") -> str: def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
""" """
Generate project list HTML content linking to each project & variant subdirectory. Generate project list HTML content linking to each project & variant sub-directory.
""" """
href_tags = [] href_tags = []
for name in sorted(subdir_names): for name in sorted(subdir_names):
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
comment (str | None): Optional comment to include in the generated HTML files. comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata. First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory). We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
The index for the default variant (if any) is generated in the root index directory. The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted. is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant. Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
as the default variant index, but the links are adjusted accordingly. as the default variant index, but the links are adjusted accordingly.
Index directory structure: Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
vllm/ vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path) index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant subdirectory cpu/ # cpu variant sub-directory
index.html index.html
vllm/ vllm/
index.html index.html
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
vllm/ vllm/
index.html index.html
metadata.json metadata.json
cu130/ # cu130 variant subdirectory cu130/ # cu130 variant sub-directory
index.html index.html
vllm/ vllm/
index.html index.html
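The directory layout described in this docstring is a standard PEP 503 "simple" index, so a generated tree can be smoke-tested locally by serving it over HTTP and pointing pip at it. A rough sketch, assuming the index was generated into `./index` and the wheel it links to is reachable at the relative path in the generated `index.html`:

```bash
# Serve the generated index directory locally (PEP 503 layout).
python3 -m http.server 8080 --directory ./index &
SERVER_PID=$!

# Resolve vllm through the local index only; a variant subdirectory such as
# cu130/ can be targeted the same way by appending it to the URL.
pip install --dry-run --no-deps vllm \
  --index-url http://localhost:8080/ \
  --trusted-host localhost

kill $SERVER_PID
```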

View File

@@ -44,17 +44,6 @@ cleanup_docker() {
fi fi
} }
cleanup_network() {
for node in $(seq 0 $((NUM_NODES-1))); do
    if docker ps -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}"
fi
done
if docker network ls | grep docker-net; then
docker network rm docker-net
fi
}
# Call the cleanup docker function # Call the cleanup docker function
cleanup_docker cleanup_docker
@@ -235,35 +224,6 @@ if [[ $commands == *"--shard-id="* ]]; then
echo "All shards reported no tests collected. Failing the build." echo "All shards reported no tests collected. Failing the build."
exit 1 exit 1
fi fi
elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
myIFS=$IFS
IFS=','
read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${commands}"
composite_command=$(echo "${composite_command} && ${commands}")
done
/bin/bash -c "${composite_command}"
cleanup_network
else
echo "Failed to parse node commands! Exiting."
cleanup_network
exit 111
fi
else else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \

View File

@@ -46,7 +46,7 @@ docker run \
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
' '

View File

@@ -1,36 +0,0 @@
#!/bin/bash
set -ex
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
# otherwise they will be cleaned up together with the main "nightly" tags.
TAG_VARIANT="$1"
if [ -n "$TAG_VARIANT" ]; then
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
TAG_NAME="$TAG_VARIANT-nightly"
else
ORIG_TAG_SUFFIX=""
TAG_NAME="nightly"
fi
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT

View File

@@ -18,18 +18,15 @@ wait_for_server() {
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
# Set BACKENDS and platform-specific args based on platform # Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform # ROCm platform
BACKENDS=("allgather_reducescatter") BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail # Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0 export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else else
# Non-ROCm platform (CUDA/other) # Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency") BACKENDS=("deepep_high_throughput" "deepep_low_latency")
PLATFORM_ARGS=()
fi fi
cleanup() { cleanup() {
@@ -57,7 +54,6 @@ for BACK in "${BACKENDS[@]}"; do
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--gpu-memory-utilization 0.9 \ --gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \
--port $PORT & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server $PORT wait_for_server $PORT
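The `wait_for_server` helper used above is defined earlier in the script and is not shown in this hunk; a typical implementation just polls the OpenAI-compatible server's `/health` endpoint until it answers. A hedged sketch of such a helper (the timeout value is an assumption, not taken from the original script):

```bash
# Poll the server's /health endpoint until it responds or a timeout is hit.
wait_for_server() {
  local port=$1
  local timeout=600   # seconds; illustrative default, not from the original script
  local start=$(date +%s)
  until curl -sf "http://localhost:${port}/health" > /dev/null; do
    if (( $(date +%s) - start > timeout )); then
      echo "server on port ${port} did not become ready" >&2
      return 1
    fi
    sleep 5
  done
}
```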

View File

@@ -1,227 +0,0 @@
#!/bin/bash
#
# trigger-ci-build.sh
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
#
# Usage: ./trigger-ci-build.sh [options]
#
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
#
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
#
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default configuration
PIPELINE="ci"
DRY_RUN=true
usage() {
cat <<EOF
Usage: $(basename "$0") [options]
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
Options:
--execute Actually trigger the build (default: dry-run)
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
--commit Override commit SHA (default: current HEAD)
--branch Override branch name (default: current branch)
--message Custom build message (default: auto-generated)
--help Show this help message
Prerequisites:
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
- bk configured: bk configure
Examples:
$(basename "$0") # Dry-run, show what would happen
$(basename "$0") --execute # Actually trigger the build
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
EOF
exit 1
}
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[OK]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Parse arguments
COMMIT=""
BRANCH=""
MESSAGE=""
while [[ $# -gt 0 ]]; do
case $1 in
--execute)
DRY_RUN=false
shift
;;
--pipeline)
PIPELINE="$2"
shift 2
;;
--commit)
COMMIT="$2"
shift 2
;;
--branch)
BRANCH="$2"
shift 2
;;
--message)
MESSAGE="$2"
shift 2
;;
--help|-h)
usage
;;
-*)
log_error "Unknown option: $1"
usage
;;
*)
log_error "Unexpected argument: $1"
usage
;;
esac
done
# Check if bk CLI is installed
if ! command -v bk &>/dev/null; then
log_error "Buildkite CLI (bk) is not installed"
echo ""
echo "Install with:"
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
echo ""
echo "Then configure:"
echo " bk configure"
exit 1
fi
# Check if we're in a git repository
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
log_error "Not in a git repository"
exit 1
fi
# Get current commit and branch if not overridden
if [[ -z "$COMMIT" ]]; then
COMMIT=$(git rev-parse HEAD)
fi
if [[ -z "$BRANCH" ]]; then
BRANCH=$(git branch --show-current)
if [[ -z "$BRANCH" ]]; then
# Detached HEAD state - try to get branch from ref
BRANCH=$(git rev-parse --abbrev-ref HEAD)
fi
fi
# Generate default message if not provided
if [[ -z "$MESSAGE" ]]; then
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
MESSAGE="[Manual] ${COMMIT_MSG}"
fi
# Safety check: Verify the commit exists on the remote
log_info "Verifying commit exists on remote..."
git fetch origin --quiet 2>/dev/null || true
# Check if commit is reachable from any remote branch
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
if [[ -z "$REMOTE_BRANCHES" ]]; then
log_error "Commit ${COMMIT} does not exist on any remote branch!"
echo ""
echo "The CI system will fail to checkout this commit."
echo "Please push your changes first:"
echo ""
echo " git push origin ${BRANCH}"
echo ""
exit 1
fi
log_success "Commit found on remote branches:"
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
echo " ... and more"
fi
echo ""
log_info "Pipeline: ${PIPELINE}"
log_info "Branch: ${BRANCH}"
log_info "Commit: ${COMMIT}"
log_info "Message: ${MESSAGE}"
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
echo ""
# Build the command
CMD=(bk build create
-y
-w
-i
--pipeline "${PIPELINE}"
--commit "${COMMIT}"
--branch "${BRANCH}"
--message "${MESSAGE}"
--env "RUN_ALL=1"
--env "NIGHTLY=1"
)
if [[ "$DRY_RUN" == true ]]; then
echo "=========================================="
log_warn "DRY-RUN MODE - No build will be triggered"
echo "=========================================="
echo ""
echo "Command that would be executed:"
echo ""
# Escape single quotes in values for safe shell display
escape_for_shell() {
printf '%s' "$1" | sed "s/'/'\\\\''/g"
}
echo " bk build create \\"
echo " -y \\"
echo " -w \\"
echo " -i \\"
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
echo " --env 'RUN_ALL=1' \\"
echo " --env 'NIGHTLY=1'"
echo ""
echo "=========================================="
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
echo ""
echo " $0 --execute"
echo "=========================================="
exit 0
fi
log_info "Triggering build..."
# Execute the command - bk will print the URL and open browser
"${CMD[@]}"

View File

@@ -1,70 +0,0 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION"
if [[ -z "$GIT_VERSION" ]]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
# check pypi token
if [[ -z "$PYPI_TOKEN" ]]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
set -x # avoid printing secrets above
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl

View File

@@ -71,7 +71,6 @@ steps:
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_ - tests/tokenizers_
- tests/tool_parsers - tests/tool_parsers
@@ -83,7 +82,6 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
- pytest -v -s tool_parsers - pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
@@ -430,8 +428,6 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
@@ -456,12 +452,10 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
- pytest -v -s v1/attention - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
@@ -640,9 +634,8 @@ steps:
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -710,17 +703,6 @@ steps:
- pytest -v -s kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py - pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Kernels Helion Test
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@@ -873,7 +855,7 @@ steps:
- label: Language Models Tests (Standard) - label: Language Models Tests (Standard)
timeout_in_minutes: 25 timeout_in_minutes: 25
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@@ -1132,7 +1114,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1278,7 +1260,7 @@ steps:
- label: 2 Node Tests (4 GPUs in total) # 16min - label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdmultinode] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@@ -1292,15 +1274,15 @@ steps:
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 68min
@@ -1469,7 +1451,7 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min - label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 30 timeout_in_minutes: 30
@@ -1480,10 +1462,10 @@ steps:
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 15 timeout_in_minutes: 15
@@ -1494,7 +1476,7 @@ steps:
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test ##### ##### multi gpus test #####
##### A100 test ##### ##### A100 test #####
@@ -1509,9 +1491,6 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
commands: commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
# NOTE: don't test llama model here, it seems hf implementation is buggy # NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details # see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py - pytest -v -s distributed/test_custom_all_reduce.py
@@ -1683,6 +1662,17 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60

View File

@@ -64,7 +64,6 @@ steps:
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_ - tests/tokenizers_
- tests/tool_parsers - tests/tool_parsers
@@ -76,7 +75,6 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
- pytest -v -s tool_parsers - pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
@@ -362,7 +360,7 @@ steps:
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
- pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
@@ -376,8 +374,6 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
@@ -400,12 +396,10 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
- pytest -v -s v1/attention - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
source_file_dependencies: source_file_dependencies:
@@ -568,9 +562,8 @@ steps:
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -631,56 +624,6 @@ steps:
- pytest -v -s kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py - pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Kernels Helion Test
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
gpu: h100
num_gpus: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
gpu: h100
num_gpus: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
gpu: b200
num_gpus: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@@ -1008,7 +951,7 @@ steps:
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 23 min - label: Blackwell Test # 21 min
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 gpu: b200
@@ -1018,7 +961,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1048,8 +991,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Blackwell Fusion and Compile Tests # 30 min - label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -1104,48 +1045,6 @@ steps:
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Hopper Fusion E2E Tests (H100) # 10min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
gpu: h100
optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# skip Llama-4 since it does not fit on this device
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
gpu: h100
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- label: Blackwell GPT-OSS Eval - label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
@@ -1317,7 +1216,7 @@ steps:
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Pipeline + Context Parallelism Test # 45min - label: Pipeline + Context Parallelism Test # 45min
timeout_in_minutes: 60 timeout_in_minutes: 60
@@ -1420,20 +1319,6 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: Acceptance Length Test (Large Models) # optional
timeout_in_minutes: 120
gpu: h100
optional: true
num_gpus: 1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/model_executor/models/mlp_speculator.py
- tests/v1/spec_decode/test_acceptance_length.py
commands:
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
gpu: a100 gpu: a100
optional: true optional: true
@@ -1459,31 +1344,22 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: Sequence Parallel Tests (H100) # 60 min ##### H200 test #####
timeout_in_minutes: 60 - label: Distributed Tests (H200) # optional
working_dir: "/vllm-workspace/" gpu: h200
gpu: h100
optional: true
num_gpus: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run sequence parallel tests
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- label: Distributed Tests (H100) # optional
gpu: h100
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### H200 test #####
- label: LM Eval Large Models (H200) # optional - label: LM Eval Large Models (H200) # optional
timeout_in_minutes: 60 timeout_in_minutes: 60
gpu: h200 gpu: h200


@@ -4,10 +4,8 @@ depends_on:
steps: steps:
- label: V1 attention (H100) - label: V1 attention (H100)
timeout_in_minutes: 30 timeout_in_minutes: 30
device: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
@@ -15,11 +13,9 @@ steps:
- label: V1 attention (B200) - label: V1 attention (B200)
timeout_in_minutes: 30 timeout_in_minutes: 30
device: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
- pytest -v -s v1/attention - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this


@@ -5,7 +5,7 @@ steps:
- label: Fusion and Compile Tests (B200) - label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40 timeout_in_minutes: 40
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -26,7 +26,7 @@ steps:
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_devices=2 is not set # this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml # Wrap with quotes to escape yaml
@@ -37,9 +37,9 @@ steps:
- label: Fusion E2E (2 GPUs)(B200) - label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40 timeout_in_minutes: 40
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
optional: true optional: true
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py


@@ -5,7 +5,7 @@ steps:
- label: Distributed Comm Ops - label: Distributed Comm Ops
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/distributed - vllm/distributed
- tests/distributed - tests/distributed
@@ -18,7 +18,7 @@ steps:
- label: Distributed (2 GPUs) - label: Distributed (2 GPUs)
timeout_in_minutes: 90 timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/compilation/ - vllm/compilation/
- vllm/distributed/ - vllm/distributed/
@@ -54,7 +54,7 @@ steps:
- label: Distributed Tests (4 GPUs) - label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- tests/distributed/test_utils - tests/distributed/test_utils
@@ -103,8 +103,8 @@ steps:
- label: Distributed Tests (8 GPUs)(H100) - label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10 timeout_in_minutes: 10
device: h100 gpu: h100
num_devices: 8 num_gpus: 8
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py - examples/offline_inference/torchrun_dp_example.py
@@ -120,9 +120,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100) - label: Distributed Tests (4 GPUs)(A100)
device: a100 gpu: a100
optional: true optional: true
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
commands: commands:
@@ -133,34 +133,26 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: Sequence Parallel Tests (H100) - label: Distributed Tests (2 GPUs)(H200)
timeout_in_minutes: 60 gpu: h200
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run sequence parallel tests
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- label: Distributed Tests (2 GPUs)(H100)
device: h100
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_devices: 2 num_gpus: 2
commands: commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200) - label: Distributed Tests (2 GPUs)(B200)
device: b200 gpu: b200
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_devices: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -169,9 +161,8 @@ steps:
- label: 2 Node Test (4 GPUs) - label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
num_nodes: 2 num_nodes: 2
no_plugin: true
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
@@ -180,12 +171,12 @@ steps:
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs) - label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
@@ -193,21 +184,10 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs) - label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
@@ -216,46 +196,4 @@ steps:
- tests/distributed/ - tests/distributed/
commands: commands:
- pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pipeline_parallel.py
- label: Hopper Fusion E2E Tests (H100)
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
device: h100
optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# skip Llama-4 since it does not fit on this device
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
- label: Hopper Fusion Distributed E2E Tests (2xH100)
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py


@@ -4,27 +4,27 @@ depends_on:
steps: steps:
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60
device: h100 gpu: h100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60
device: h100 gpu: h100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
device: b200 gpu: b200
optional: true optional: true
num_devices: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -33,11 +33,10 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
optional: true optional: true
soft_fail: true soft_fail: true
num_devices: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh


@@ -23,8 +23,4 @@ steps:
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
# Run this test standalone for now; - pytest -v -s v1/engine
# need to untangle the (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py


@@ -14,7 +14,7 @@ steps:
- label: EPLB Execution - label: EPLB Execution
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/eplb - vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py - tests/distributed/test_eplb_execute.py


@@ -15,9 +15,8 @@ steps:
timeout_in_minutes: 35 timeout_in_minutes: 35
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -58,8 +57,8 @@ steps:
- label: Kernels DeepGEMM Test (H100) - label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45 timeout_in_minutes: 45
device: h100 gpu: h100
num_devices: 1 num_gpus: 1
source_file_dependencies: source_file_dependencies:
- tools/install_deepgemm.sh - tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py - vllm/utils/deep_gemm.py
@@ -78,7 +77,7 @@ steps:
- label: Kernels (B200) - label: Kernels (B200)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
# optional: true # optional: true
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
@@ -86,7 +85,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -115,55 +114,4 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Kernels Helion Test
timeout_in_minutes: 30
device: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
device: h100
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
device: h100
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py


@@ -12,9 +12,9 @@ steps:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 GPUs)(A100) - label: LM Eval Large Models (4 GPUs)(A100)
device: a100 gpu: a100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
@@ -24,9 +24,9 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100) - label: LM Eval Large Models (4 GPUs)(H100)
device: h100 gpu: h100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
@@ -37,39 +37,10 @@ steps:
- label: LM Eval Small Models (B200) - label: LM Eval Small Models (B200)
timeout_in_minutes: 120 timeout_in_minutes: 120
device: b200 gpu: b200
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200
optional: true
num_devices: 8
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
device: h100
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt


@@ -14,7 +14,7 @@ steps:
- label: LoRA TP (Distributed) - label: LoRA TP (Distributed)
timeout_in_minutes: 30 timeout_in_minutes: 30
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora


@@ -27,12 +27,11 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Others (CPU) - label: V1 Others (CPU)
depends_on: depends_on: ~
- image-build-cpu
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
device: cpu no_gpu: true
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core - pytest -v -s -m 'cpu_test' v1/core
@@ -83,7 +82,7 @@ steps:
- label: Metrics, Tracing (2 GPUs) - label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20 timeout_in_minutes: 20
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1/tracing - tests/v1/tracing
@@ -115,27 +114,24 @@ steps:
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU) - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: depends_on: ~
- image-build-cpu
timeout_in_minutes: 30 timeout_in_minutes: 30
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_ - tests/tokenizers_
- tests/tool_parsers - tests/tool_parsers
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
device: cpu no_gpu: true
commands: commands:
- python3 standalone_tests/lazy_imports.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
- pytest -v -s tool_parsers - pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
@@ -144,7 +140,7 @@ steps:
- label: GPT-OSS Eval (B200) - label: GPT-OSS Eval (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- tests/evals/gpt_oss - tests/evals/gpt_oss
@@ -157,7 +153,7 @@ steps:
- label: Batch Invariance (H100) - label: Batch Invariance (H100)
timeout_in_minutes: 25 timeout_in_minutes: 25
device: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/v1/attention - vllm/v1/attention
- vllm/model_executor/layers - vllm/model_executor/layers


@@ -39,14 +39,12 @@ steps:
- pytest -v -s models/test_transformers.py models/test_registry.py - pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min - label: Basic Models Test (Other CPU) # 5min
depends_on:
- image-build-cpu
timeout_in_minutes: 10 timeout_in_minutes: 10
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/test_utils.py - tests/models/test_utils.py
- tests/models/test_vision.py - tests/models/test_vision.py
device: cpu no_gpu: true
commands: commands:
- pytest -v -s models/test_utils.py models/test_vision.py - pytest -v -s models/test_utils.py models/test_vision.py


@@ -5,7 +5,7 @@ steps:
- label: Distributed Model Tests (2 GPUs) - label: Distributed Model Tests (2 GPUs)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py - vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/ - vllm/model_executor/models/


@@ -14,13 +14,11 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Processor Test (CPU) - label: Multi-Modal Processor Test (CPU)
depends_on:
- image-build-cpu
timeout_in_minutes: 60 timeout_in_minutes: 60
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
device: cpu no_gpu: true
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py


@@ -5,7 +5,7 @@ steps:
- label: Plugin Tests (2 GPUs) - label: Plugin Tests (2 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/plugins/ - vllm/plugins/
- tests/plugins/ - tests/plugins/


@@ -16,14 +16,14 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: Quantized MoE Test (B200) - label: Quantized MoE Test (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- tests/quantization/test_blackwell_moe.py - tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py - vllm/model_executor/models/deepseek_v2.py


@@ -5,7 +5,7 @@ steps:
- label: Weight Loading Multiple GPU # 33min - label: Weight Loading Multiple GPU # 33min
timeout_in_minutes: 45 timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -15,8 +15,8 @@ steps:
- label: Weight Loading Multiple GPU - Large Models # optional - label: Weight Loading Multiple GPU - Large Models # optional
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
device: a100 gpu: a100
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/

.github/CODEOWNERS vendored

@@ -2,8 +2,8 @@
# for more info about CODEOWNERS file # for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review # This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
@@ -16,7 +16,7 @@
/vllm/entrypoints @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang /vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery /vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact, # Any change to the VllmConfig changes can have a large user-facing impact,
@@ -30,14 +30,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill /vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery /vllm/v1/offloading @ApostaC
/vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
# Model runner V2 # Model runner V2
/vllm/v1/worker/gpu @WoosukKwon /vllm/v1/worker/gpu @WoosukKwon
@@ -56,13 +54,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96 /tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee /tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep /tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC @orozery /tests/v1/kv_connector @ApostaC
/tests/v1/kv_offload @ApostaC @orozery /tests/v1/offloading @ApostaC
/tests/v1/determinism @yewentao256 /tests/v1/determinism @yewentao256
# Transformers modeling backend # Transformers modeling backend

.github/mergify.yml vendored

@@ -414,18 +414,6 @@ pull_request_rules:
remove: remove:
- needs-rebase - needs-rebase
- name: label-bug
description: Automatically apply bug label
conditions:
- label != stale
- or:
- title~=(?i)\bbug\b
- title~=(?i)\bbugfix\b
actions:
label:
add:
- bug
- name: label-kv-connector - name: label-kv-connector
description: Automatically apply kv-connector label description: Automatically apply kv-connector label
conditions: conditions:


@@ -29,9 +29,8 @@ jobs:
- name: Install dependencies and build vLLM - name: Install dependencies and build vLLM
run: | run: |
uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
uv pip install -e . --no-build-isolation uv pip install -e .
env: env:
CMAKE_BUILD_PARALLEL_LEVEL: 4 CMAKE_BUILD_PARALLEL_LEVEL: 4

.gitignore vendored

@@ -7,9 +7,6 @@ vllm/vllm_flash_attn/*
# OpenAI triton kernels copied from source # OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/* vllm/third_party/triton_kernels/*
# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py
# triton jit # triton jit
.triton .triton
@@ -194,9 +191,6 @@ CLAUDE.md
AGENTS.md AGENTS.md
.codex/ .codex/
# Cursor
.cursor/
# DS Store # DS Store
.DS_Store .DS_Store


@@ -147,17 +147,6 @@ repos:
entry: python tools/pre_commit/validate_config.py entry: python tools/pre_commit/validate_config.py
language: python language: python
additional_dependencies: [regex] additional_dependencies: [regex]
- id: validate-docker-versions
name: Validate docker/versions.json matches Dockerfile
entry: python tools/generate_versions_json.py --check
language: python
files: ^docker/(Dockerfile|versions\.json)$
pass_filenames: false
additional_dependencies: [dockerfile-parse]
- id: attention-backend-docs
name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion


@@ -377,7 +377,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# preselected input type pairs and schedules. # preselected input type pairs and schedules.
# Generate sources: # Generate sources:
set(MARLIN_GEN_SCRIPT set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/marlin/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR) list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})") set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
@@ -412,7 +412,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_float16.cu") file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}") CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_bfloat16.cu") file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_BF16_ARCHS}") CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
@@ -434,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
if (MARLIN_SM75_ARCHS) if (MARLIN_SM75_ARCHS)
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu") file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_SM75_ARCHS}") CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
@@ -446,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
if (MARLIN_FP8_ARCHS) if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu") file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_FP8_ARCHS}") CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
@@ -458,10 +458,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/marlin/marlin.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/marlin/awq_marlin_repack.cu") "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}" SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}") CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")


@@ -1,266 +0,0 @@
# vLLM Attention Benchmarking Suite
Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
## Quick Start
```bash
cd benchmarks/attention_benchmarks
# Run a pre-configured benchmark
python benchmark.py --config configs/mla_decode.yaml
python benchmark.py --config configs/mla_mixed_batch.yaml
python benchmark.py --config configs/speculative_decode.yaml
python benchmark.py --config configs/standard_attention.yaml
python benchmark.py --config configs/reorder_threshold.yaml
# Or run custom benchmarks
python benchmark.py \
--backends flash flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--output-csv results.csv
```
## Simplified Batch Specification Grammar
Express workloads concisely using query length and sequence length:
```python
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
"q1s1k" # Decode: 1 token with 1K sequence
"8q1s1k" # 8 decode requests
"q4s1k" # 4-token extend (e.g., spec decode)
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
"16q4s1k" # 16 spec decode (4 tokens each)
```
### Grammar Rule
```text
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k': Multiplies value by 1024
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
```
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
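For example, a mixed spec simply decomposes into a flat list of (q_len, seq_len) requests. A minimal sketch using the suite's own `parse_batch_spec` and `get_batch_stats` helpers (documented in the Python API section below):
```python
from batch_spec import parse_batch_spec, get_batch_stats

# "2q2k_32q1s1k" = 2 prefills of 2048 tokens + 32 decodes against 1K contexts
requests = parse_batch_spec("2q2k_32q1s1k")
print(requests[0].as_tuple())   # (2048, 2048) -> prefill
print(requests[-1].as_tuple())  # (1, 1024)    -> decode

stats = get_batch_stats(requests)
print(stats["num_prefill"], stats["num_decode"])  # 2 32
```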
## Pre-configured Benchmarks
The suite includes several pre-configured YAML benchmark configurations:
### MLA Decode Benchmark
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
```bash
python benchmark.py --config configs/mla_decode.yaml
```
### MLA Mixed Batch Benchmark
Tests chunked prefill performance with mixed prefill + decode batches.
```bash
python benchmark.py --config configs/mla_mixed_batch.yaml
```
### Speculative Decoding Benchmark
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
```bash
python benchmark.py --config configs/speculative_decode.yaml
```
### Standard Attention Benchmark
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
```bash
python benchmark.py --config configs/standard_attention.yaml
```
### Reorder Threshold Study
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
```bash
python benchmark.py --config configs/reorder_threshold.yaml
```
---
## Universal Benchmark
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
### Standard Attention (Flash/Triton/FlashInfer)
```bash
python benchmark.py \
--backends flash triton flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--num-layers 10 \
--repeats 5 \
--output-csv results.csv
```
### MLA Backends
```bash
# Compare all MLA backends
python benchmark.py \
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
--batch-specs "64q1s1k" "64q1s4k" \
--output-csv mla_results.csv
```
### Parameter Sweeps
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
#### CUTLASS MLA num-splits Optimization
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
```bash
python benchmark.py \
--backend cutlass_mla \
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
--sweep-param num_kv_splits \
--sweep-values 1 2 4 8 16 \
--output-json optimal_splits.json
```
#### Reorder Batch Threshold Optimization
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
```bash
python benchmark.py \
--backend flashmla \
--batch-specs "q4s1k" "q8s2k" \
--sweep-param reorder_batch_threshold \
--sweep-values 1 4 16 64 256 512 \
--output-csv threshold_sweep.csv
```
### All Command-Line Options
```text
--config CONFIG # Path to YAML config file (overrides other args)
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
# flashinfer_mla, flashattn_mla, flashmla
--backend BACKEND # Single backend (alternative to --backends)
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
# Model configuration
--num-layers N # Number of layers
--head-dim N # Head dimension
--num-q-heads N # Query heads
--num-kv-heads N # KV heads
--block-size N # Block size
# Benchmark settings
--device DEVICE # Device (default: cuda:0)
--repeats N # Repetitions
--warmup-iters N # Warmup iterations
--profile-memory # Profile memory usage
# Parameter sweeps
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
# reorder_batch_threshold)
--sweep-values N [N ...] # Values to sweep for the parameter
# Output
--output-csv FILE # Save to CSV
--output-json FILE # Save to JSON
```
## Hardware Requirements
| Backend | Hardware |
|---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) |
| FlashMLA | Hopper (SM90+) |
| FlashInfer-MLA | Any CUDA GPU |
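If you are unsure what your GPU supports, one quick way to narrow down candidate MLA backends is to read the CUDA compute capability. This snippet is a rough sketch using plain PyTorch and is not part of the suite itself; it just applies the table above:
```python
import torch

# Map the current GPU's compute capability to the MLA backends listed above.
major, minor = torch.cuda.get_device_capability()
sm = major * 10 + minor

candidates = ["flashinfer_mla"]                    # any CUDA GPU
if sm >= 90:
    candidates += ["flashattn_mla", "flashmla"]    # Hopper (SM90+)
if sm >= 100:
    candidates.append("cutlass_mla")               # Blackwell (SM100+)
print(f"SM{sm}: candidate MLA backends -> {candidates}")
```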
## Using MLA Runner Directly
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
```python
from mla_runner import run_mla_benchmark
from common import BenchmarkConfig
config = BenchmarkConfig(
backend="cutlass_mla",
batch_spec="64q1s4k",
num_layers=10,
head_dim=576,
num_q_heads=128,
num_kv_heads=1,
block_size=128,
device="cuda:0",
repeats=5,
warmup_iters=3,
)
# CUTLASS MLA with specific num_kv_splits
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
print(f"Time: {result.mean_time:.6f}s")
# FlashInfer-MLA
result = run_mla_benchmark("flashinfer_mla", config)
# FlashAttn MLA (Hopper SM90+)
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
# FlashMLA (Hopper SM90+)
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
```
## Python API
```python
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
# Parse batch specs
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
print(format_batch_spec(requests))
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
# Get batch statistics
stats = get_batch_stats(requests)
print(f"Total tokens: {stats['total_tokens']}")
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
# Format results
formatter = ResultsFormatter()
formatter.save_csv(results, "output.csv")
formatter.save_json(results, "output.json")
```
## Tips
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
**2. Multiple repeats** - Use `--repeats 20` for low variance
**3. Save results** - Always use `--output-csv` or `--output-json`
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values


@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM Attention Benchmarking Suite."""
from .batch_spec import (
BatchRequest,
format_batch_spec,
get_batch_stats,
parse_batch_spec,
reorder_for_flashinfer,
split_by_type,
)
from .common import (
BenchmarkConfig,
BenchmarkResult,
MockLayer,
MockModelConfig,
ResultsFormatter,
get_attention_scale,
is_mla_backend,
setup_mla_dims,
)
__all__ = [
# Batch specification
"BatchRequest",
"parse_batch_spec",
"format_batch_spec",
"reorder_for_flashinfer",
"split_by_type",
"get_batch_stats",
# Benchmarking infrastructure
"BenchmarkConfig",
"BenchmarkResult",
"ResultsFormatter",
# Mock objects
"MockLayer",
"MockModelConfig",
# Utilities
"setup_mla_dims",
"get_attention_scale",
"is_mla_backend",
]

View File

@@ -1,231 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simplified batch specification grammar for attention benchmarks.
Grammar (underscore-separated segments):
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k' suffix: Multiplies value by 1024
Common patterns:
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
Examples:
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
"""
from collections import Counter
from dataclasses import dataclass
import regex as re
@dataclass
class BatchRequest:
"""Represents a single request in a batch."""
q_len: int # Query length (number of new tokens)
kv_len: int # Total KV cache length
@property
def is_decode(self) -> bool:
"""True if this is a decode request (q_len == 1)."""
return self.q_len == 1
@property
def is_prefill(self) -> bool:
"""True if this is a pure prefill (q_len == kv_len)."""
return self.q_len == self.kv_len
@property
def is_extend(self) -> bool:
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
return self.q_len > 1 and self.kv_len > self.q_len
@property
def context_len(self) -> int:
"""Context length (KV cache - query)."""
return self.kv_len - self.q_len
def as_tuple(self) -> tuple[int, int]:
"""Return as (q_len, kv_len) tuple for compatibility."""
return (self.q_len, self.kv_len)
def _parse_size(size_str: str, k_suffix: str) -> int:
"""Parse size string with optional 'k' suffix."""
size = int(size_str)
return size * 1024 if k_suffix == "k" else size
def parse_batch_spec(spec: str) -> list[BatchRequest]:
"""
Parse batch specification string into list of BatchRequest objects.
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
Args:
spec: Batch specification string (see module docstring for grammar)
Returns:
List of BatchRequest objects
Raises:
ValueError: If spec format is invalid
"""
requests = []
for seg in spec.split("_"):
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
if m:
cnt = int(m.group(1)) if m.group(1) else 1
q_len = _parse_size(m.group(2), m.group(3))
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
continue
raise ValueError(f"Invalid batch spec segment: '{seg}'")
return requests
def format_batch_spec(requests: list[BatchRequest]) -> str:
"""
Format list of BatchRequest into human-readable string.
Groups requests by type and provides counts and sizes.
Args:
requests: List of BatchRequest objects
Returns:
Formatted string describing the batch
"""
kinds = {
"prefill": [],
"extend": [],
"decode": [],
}
for req in requests:
tup = (req.q_len, req.kv_len)
if req.is_prefill:
kinds["prefill"].append(tup)
elif req.is_extend:
kinds["extend"].append(tup)
elif req.is_decode:
kinds["decode"].append(tup)
parts = []
for kind in ["prefill", "extend", "decode"]:
lst = kinds[kind]
if not lst:
continue
cnt_total = len(lst)
ctr = Counter(lst)
inner = []
for (q, kv), cnt in ctr.items():
if kind == "prefill":
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
inner.append(f"{cnt}x{size}")
elif kind == "decode":
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}x{size}")
else: # extend
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}xq{qstr}kv{kstr}")
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
return ", ".join(parts)
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
"""
Reorder requests for FlashInfer: decode first, then prefill.
FlashInfer expects decode requests before prefill requests for
optimal performance.
Args:
requests: Original list of BatchRequest
Returns:
Reordered list with decode requests first
"""
decodes = [r for r in requests if r.is_decode]
non_decodes = [r for r in requests if not r.is_decode]
return decodes + non_decodes
def split_by_type(
requests: list[BatchRequest],
) -> dict[str, list[BatchRequest]]:
"""
Split requests by type for analysis.
Args:
requests: List of BatchRequest
Returns:
Dict with keys: 'decode', 'prefill', 'extend'
"""
result = {
"decode": [],
"prefill": [],
"extend": [],
}
for req in requests:
if req.is_decode:
result["decode"].append(req)
elif req.is_prefill:
result["prefill"].append(req)
elif req.is_extend:
result["extend"].append(req)
return result
def get_batch_stats(requests: list[BatchRequest]) -> dict:
"""
Compute statistics about a batch.
Args:
requests: List of BatchRequest
Returns:
Dict with batch statistics
"""
by_type = split_by_type(requests)
return {
"total_requests": len(requests),
"num_decode": len(by_type["decode"]),
"num_prefill": len(by_type["prefill"]),
"num_extend": len(by_type["extend"]),
"total_tokens": sum(r.q_len for r in requests),
"total_kv_cache": sum(r.kv_len for r in requests),
"max_q_len": max((r.q_len for r in requests), default=0),
"max_kv_len": max((r.kv_len for r in requests), default=0),
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
"avg_kv_len": (
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}

View File

@@ -1,886 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Universal vLLM Attention Benchmark
Benchmark any attention backend with the extended grammar.
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
Examples:
# Standard attention
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
# MLA backends
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
# Parameter sweep (CLI)
python benchmark.py --backend cutlass_mla \
--batch-specs "64q1s1k" \
--sweep-param num_kv_splits \
--sweep-values 1 4 8 16
# Parameter sweep (YAML config - recommended)
python benchmark.py --config configs/cutlass_numsplits.yaml
"""
import argparse
import sys
from dataclasses import replace
from pathlib import Path
import yaml
from rich.console import Console
from tqdm import tqdm
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from batch_spec import parse_batch_spec
from common import (
BenchmarkConfig,
BenchmarkResult,
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
is_mla_backend,
)
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
from runner import run_attention_benchmark
return run_attention_benchmark(config)
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""
Run a single benchmark with proper backend selection.
Args:
config: BenchmarkConfig with backend, batch_spec, and model params
**kwargs: Additional arguments passed to MLA benchmarks
Returns:
BenchmarkResult (may have error field set on failure)
"""
try:
if is_mla_backend(config.backend):
return run_mla_benchmark(config, **kwargs)
else:
return run_standard_attention_benchmark(config)
except Exception as e:
return BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
def run_model_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ModelParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run model parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ModelParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
console.print(
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
)
total = len(backends) * len(batch_specs) * len(sweep.values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep.values:
# Create config with modified model parameter
config_args = base_config_args.copy()
config_args[sweep.param_name] = value
# Create config with original backend for running
clean_config = BenchmarkConfig(
backend=backend, batch_spec=spec, **config_args
)
# Run benchmark
result = run_benchmark(clean_config)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results - create separate table for each parameter value
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
formatter = ResultsFormatter(console)
# Group results by parameter value and extract backend mapping
by_param_value = {}
backend_mapping = {} # Maps labeled backend -> original backend
for r in all_results:
# Extract original backend and param value from labeled backend
# The label format is: {backend}_{param_name}_{value}
# We need to reverse engineer this
labeled_backend = r.config.backend
# Try each backend to find which one this result belongs to
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
backend_mapping[labeled_backend] = backend
param_value = str(value)
if param_value not in by_param_value:
by_param_value[param_value] = []
by_param_value[param_value].append(r)
break
# Create a table for each parameter value
sorted_param_values = sorted(
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
)
for param_value in sorted_param_values:
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
param_results = by_param_value[param_value]
# Create modified results with original backend names
modified_results = []
for r in param_results:
# Get the original backend name from our mapping
original_backend = backend_mapping[r.config.backend]
modified_config = replace(r.config, backend=original_backend)
modified_result = replace(r, config=modified_config)
modified_results.append(modified_result)
# Print table with original backend names
formatter.print_table(modified_results, backends, compare_to_fastest=True)
# Show optimal backend for each (param_value, batch_spec) combination
console.print(
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
)
# Group by (param_value, batch_spec)
by_param_and_spec = {}
for r in all_results:
if r.success:
# Find which (backend, value) this result corresponds to
labeled_backend = r.config.backend
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
param_value = str(value)
spec = r.config.batch_spec
key = (param_value, spec)
if key not in by_param_and_spec:
by_param_and_spec[key] = []
by_param_and_spec[key].append(r)
break
# Sort by param value then spec
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
)
current_param_value = None
for param_value, spec in sorted_keys:
# Print header when param value changes
if param_value != current_param_value:
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
current_param_value = param_value
results = by_param_and_spec[(param_value, spec)]
best = min(results, key=lambda r: r.mean_time)
# Extract original backend name using the mapping
backend_name = backend_mapping[best.config.backend]
# Show all backends' times for comparison
times_str = " | ".join(
[
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
for r in sorted(results, key=lambda r: r.mean_time)
]
)
console.print(
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
)
return all_results
def run_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
# Build list of values to sweep (including auto if requested)
sweep_values = list(sweep.values)
if sweep.include_auto:
sweep_values.append("auto")
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
total = len(backends) * len(batch_specs) * len(sweep_values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep_values:
# Create config with original backend for running
config = BenchmarkConfig(
backend=backend, batch_spec=spec, **base_config_args
)
# Prepare kwargs for benchmark runner
kwargs = {}
if value != "auto":
kwargs[sweep.param_name] = value
# Run benchmark
result = run_benchmark(config, **kwargs)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results
console.print("\n[bold green]Sweep Results:[/]")
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backend_labels)
# Show optimal values
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
by_spec = {}
for r in all_results:
if r.success:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys()):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
)
return all_results
def load_config_from_yaml(config_path: str) -> dict:
"""Load configuration from YAML file."""
with open(config_path) as f:
return yaml.safe_load(f)
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
"""
Generate batch specs from range specifications.
Args:
ranges: List of range specifications, each containing:
- template: Batch spec template (e.g., "q{q_len}kv1k")
- q_len: Dict with start, stop, step, end_inclusive (optional)
- Other parameters can also be ranges
Returns:
List of generated batch spec strings
Example:
ranges = [
{
"template": "q{q_len}kv1k",
"q_len": {
"start": 1,
"stop": 16,
"step": 1,
"end_inclusive": true # Optional, defaults to true
}
}
]
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
"""
all_specs = []
for range_spec in ranges:
template = range_spec.get("template")
if not template:
raise ValueError("Range specification must include 'template'")
# Extract all range parameters from the spec
range_params = {}
for key, value in range_spec.items():
if key == "template":
continue
if isinstance(value, dict) and "start" in value:
# This is a range specification
start = value["start"]
stop = value["stop"]
step = value.get("step", 1)
# Check if end should be inclusive (default: True)
end_inclusive = value.get("end_inclusive", True)
# Adjust stop based on end_inclusive
if end_inclusive:
range_params[key] = list(range(start, stop + 1, step))
else:
range_params[key] = list(range(start, stop, step))
else:
# This is a fixed value
range_params[key] = [value]
# Generate all combinations (Cartesian product)
if range_params:
import itertools
param_names = list(range_params.keys())
param_values = [range_params[name] for name in param_names]
for values in itertools.product(*param_values):
params = dict(zip(param_names, values))
spec = template.format(**params)
all_specs.append(spec)
else:
# No parameters, just use template as-is
all_specs.append(template)
return all_specs
def main():
parser = argparse.ArgumentParser(
description="Universal vLLM attention benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Config file
parser.add_argument(
"--config",
help="Path to YAML config file (overrides other args)",
)
# Backend selection
parser.add_argument(
"--backends",
nargs="+",
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
# Batch specifications
parser.add_argument(
"--batch-specs",
nargs="+",
default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar",
)
# Model config
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
parser.add_argument("--block-size", type=int, default=16, help="Block size")
# Benchmark settings
parser.add_argument("--device", default="cuda:0", help="Device")
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
# Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument(
"--sweep-param",
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
)
parser.add_argument(
"--sweep-values",
type=int,
nargs="+",
help="Values to sweep for the parameter",
)
# Output
parser.add_argument("--output-csv", help="Save to CSV")
parser.add_argument("--output-json", help="Save to JSON")
args = parser.parse_args()
console = Console()
console.print("[bold cyan]vLLM Attention Benchmark[/]")
# Load config from YAML if provided
if args.config:
console.print(f"[yellow]Loading config from: {args.config}[/]")
yaml_config = load_config_from_yaml(args.config)
# Show description if available
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values
# (YAML takes precedence unless CLI arg was explicitly set)
# Backend(s)
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
args.mode = yaml_config["mode"]
else:
args.mode = None
# Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"]
else:
args.batch_sizes = None
# Model config
if "model" in yaml_config:
model = yaml_config["model"]
args.num_layers = model.get("num_layers", args.num_layers)
args.head_dim = model.get("head_dim", args.head_dim)
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings
if "benchmark" in yaml_config:
bench = yaml_config["benchmark"]
args.device = bench.get("device", args.device)
args.repeats = bench.get("repeats", args.repeats)
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
args.profile_memory = bench.get("profile_memory", args.profile_memory)
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:
sweep_config = yaml_config["parameter_sweep"]
args.parameter_sweep = ParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
include_auto=sweep_config.get("include_auto", False),
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.parameter_sweep = None
# Model parameter sweep configuration
if "model_parameter_sweep" in yaml_config:
sweep_config = yaml_config["model_parameter_sweep"]
args.model_parameter_sweep = ModelParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.model_parameter_sweep = None
# Output
if "output" in yaml_config:
output = yaml_config["output"]
if "csv" in output and not args.output_csv:
args.output_csv = output["csv"]
if "json" in output and not args.output_json:
args.output_json = output["json"]
console.print()
# Handle CLI-based parameter sweep (if not from YAML)
if (
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
and args.sweep_param
and args.sweep_values
):
args.parameter_sweep = ParameterSweep(
param_name=args.sweep_param,
values=args.sweep_values,
include_auto=False,
label_format="{backend}_{param_name}_{value}",
)
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
console.print(f"Backends: {', '.join(backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
# Run benchmarks
all_results = []
# Handle special mode: decode_vs_prefill comparison
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
console.print(
"[dim]For each query length, testing both decode and prefill pipelines[/]"
)
console.print("[dim]Using batched execution for optimal performance[/]")
# Extract batch sizes from config
batch_sizes = getattr(args, "batch_sizes", [1])
backend = backends[0] # Use first backend (should only be one)
# Calculate total benchmarks
total = len(batch_sizes)
with tqdm(total=total, desc="Benchmarking") as pbar:
for batch_size in batch_sizes:
# Prepare all configs for this batch size
configs_with_thresholds = []
for spec in args.batch_specs:
# Parse the batch spec to get query length
requests = parse_batch_spec(spec)
if not requests:
console.print(
f"[red]Error: Could not parse batch spec '{spec}'[/]"
)
continue
# Get query length from first request
query_length = requests[0].q_len
# Create batch spec for this batch size
# For batch_size > 1, we need to prepend the count
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
# Create base config (without backend name)
base_config = BenchmarkConfig(
backend=backend, # Will be overridden later
batch_spec=batch_spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
# Add decode pipeline config
decode_threshold = query_length
config_decode = replace(
base_config,
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
)
configs_with_thresholds.append((config_decode, decode_threshold))
# Add prefill pipeline config if query_length > 1
if query_length > 1:
prefill_threshold = query_length - 1
config_prefill = replace(
base_config,
backend=f"{backend}_prefill_qlen{query_length}"
f"_bs{batch_size}",
)
configs_with_thresholds.append(
(config_prefill, prefill_threshold)
)
# Run all benchmarks for this batch size in one go (batched mode)
try:
from mla_runner import run_mla_benchmark as run_mla
# Use batched API: pass list of (config, threshold) tuples
timing_results = run_mla(backend, configs_with_thresholds)
# Create BenchmarkResult objects from timing results
for (config, _), timing in zip(
configs_with_thresholds, timing_results
):
result = BenchmarkResult(
config=config,
mean_time=timing["mean"],
std_time=timing["std"],
min_time=timing["min"],
max_time=timing["max"],
throughput_tokens_per_sec=timing.get("throughput", None),
)
all_results.append(result)
except Exception as e:
import traceback
console.print(
f"[red]Error running batched benchmarks for "
f"batch_size={batch_size}: {e}[/]"
)
console.print("[red]Traceback:[/]")
traceback.print_exc()
# Add error results for all configs
for config, _ in configs_with_thresholds:
result = BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
all_results.append(result)
pbar.update(1)
# Display decode vs prefill results
console.print("\n[bold green]Decode vs Prefill Results:[/]")
# Group by batch size
by_batch_size = {}
for r in all_results:
if r.success:
# Extract batch size from backend name
parts = r.config.backend.split("_")
bs_part = [p for p in parts if p.startswith("bs")]
if bs_part:
bs = int(bs_part[0][2:])
if bs not in by_batch_size:
by_batch_size[bs] = []
by_batch_size[bs].append(r)
# For each batch size, analyze crossover point
for bs in sorted(by_batch_size.keys()):
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
results = by_batch_size[bs]
# Group by query length
by_qlen = {}
for r in results:
parts = r.config.backend.split("_")
qlen_part = [p for p in parts if p.startswith("qlen")]
if qlen_part:
qlen = int(qlen_part[0][4:])
if qlen not in by_qlen:
by_qlen[qlen] = {}
pipeline = "decode" if "decode" in r.config.backend else "prefill"
by_qlen[qlen][pipeline] = r
# Find crossover point
last_decode_faster = None
for qlen in sorted(by_qlen.keys()):
pipelines = by_qlen[qlen]
if "decode" in pipelines and "prefill" in pipelines:
decode_time = pipelines["decode"].mean_time
prefill_time = pipelines["prefill"].mean_time
faster = "decode" if decode_time < prefill_time else "prefill"
speedup = (
prefill_time / decode_time
if decode_time < prefill_time
else decode_time / prefill_time
)
console.print(
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
f"prefill={prefill_time:.6f}s -> "
f"[bold]{faster}[/] ({speedup:.2f}x)"
)
if faster == "decode":
last_decode_faster = qlen
if last_decode_faster is not None:
optimal_threshold = last_decode_faster
console.print(
f"\n [bold green]Optimal threshold for batch_size={bs}: "
f"{optimal_threshold}[/]"
)
console.print(
f" [dim](Use decode pipeline for query_length <= "
f"{optimal_threshold})[/]"
)
else:
console.print(
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
)
# Handle model parameter sweep mode
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
# Model parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_model_parameter_sweep(
backends,
args.batch_specs,
base_config_args,
args.model_parameter_sweep,
console,
)
# Handle parameter sweep mode (unified)
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
# Unified parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
)
else:
# Normal mode: compare backends
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
pbar.update(1)
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:
formatter = ResultsFormatter(console)
if args.output_csv:
formatter.save_csv(all_results, args.output_csv)
if args.output_json:
formatter.save_json(all_results, args.output_json)
if __name__ == "__main__":
main()

View File

@@ -1,503 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common utilities for attention benchmarking."""
import csv
import json
import math
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import numpy as np
import torch
from rich.console import Console
from rich.table import Table
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
self.model_type = "deepseek_v2"
self.is_encoder_decoder = False
self.kv_lora_rank = mla_dims["kv_lora_rank"]
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
def get_text_config(self):
return self
# Import AttentionLayerBase at module level to avoid circular dependencies
try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
_HAS_ATTENTION_LAYER_BASE = True
except ImportError:
_HAS_ATTENTION_LAYER_BASE = False
AttentionLayerBase = object # Fallback
class MockKVBProj:
"""Mock KV projection layer for MLA prefill mode.
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
"""
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
self.num_heads = num_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
Project kv_c_normed to output space.
Args:
x: Input tensor [num_tokens, kv_lora_rank]
Returns:
Tuple containing output tensor
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
"""
num_tokens = x.shape[0]
result = torch.randn(
num_tokens,
self.num_heads,
self.out_dim,
device=x.device,
dtype=x.dtype,
)
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
Inherits from AttentionLayerBase so it passes isinstance checks
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
"""
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
self._k_scale = torch.tensor(1.0, device=device)
self._v_scale = torch.tensor(1.0, device=device)
self._q_scale = torch.tensor(1.0, device=device)
# Scalar floats for kernels that need them
self._k_scale_float = float(self._k_scale.item())
self._v_scale_float = float(self._v_scale.item())
self._q_scale_float = float(self._q_scale.item())
# AttentionImpl for metadata builders to query
self.impl = impl
# KV cache spec for get_kv_cache_spec
self._kv_cache_spec = kv_cache_spec
def get_attn_backend(self):
"""Get the attention backend class (required by AttentionLayerBase)."""
# Return None as this is just a mock layer for benchmarking
return None
def get_kv_cache_spec(self):
"""Get the KV cache spec (required by AttentionLayerBase)."""
return self._kv_cache_spec
class MockModelConfig:
"""Mock model configuration."""
def __init__(
self,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.float16,
max_model_len: int = 32768,
):
self._n_q = num_q_heads
self._n_kv = num_kv_heads
self._d = head_dim
self.dtype = dtype
self.max_model_len = max_model_len
def get_num_attention_heads(self, _=None) -> int:
return self._n_q
def get_num_kv_heads(self, _=None) -> int:
return self._n_kv
def get_head_size(self) -> int:
return self._d
def get_num_layers(self) -> int:
"""Mock method for layer count queries."""
return 1
def get_sliding_window_for_layer(self, _layer_idx: int):
"""Mock method for sliding window queries."""
return None
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
"""Mock method for logits soft cap queries."""
return None
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
"""Mock method for SM scale queries."""
return 1.0 / (self.get_head_size() ** 0.5)
class MockParallelConfig:
"""Mock parallel configuration."""
pass
class MockCompilationConfig:
"""Mock compilation configuration."""
def __init__(self):
self.full_cuda_graph = False
self.static_forward_context = {}
class MockVLLMConfig:
"""Mock VLLM configuration."""
def __init__(self):
self.compilation_config = MockCompilationConfig()
class MockRunner:
"""Mock GPU runner for metadata builders."""
def __init__(
self,
seq_lens: np.ndarray,
query_start_locs: np.ndarray,
device: torch.device,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype,
):
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
self.parallel_config = MockParallelConfig()
self.vllm_config = MockVLLMConfig()
self.seq_lens_np = seq_lens
self.query_start_loc_np = query_start_locs
self.device = device
self.attention_chunk_size = None
self.num_query_heads = num_q_heads
self.num_kv_heads = num_kv_heads
self.dtype = dtype
@dataclass
class ParameterSweep:
"""Configuration for sweeping a backend parameter."""
param_name: str # Name of the backend parameter to sweep
values: list[Any] # List of values to test
include_auto: bool = False # Also test with param unset (auto mode)
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class ModelParameterSweep:
"""Configuration for sweeping a model configuration parameter."""
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
values: list[Any] # List of values to test
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark run."""
backend: str
batch_spec: str
num_layers: int
head_dim: int
num_q_heads: int
num_kv_heads: int
block_size: int
device: str
dtype: torch.dtype = torch.float16
repeats: int = 1
warmup_iters: int = 3
profile_memory: bool = False
use_cuda_graphs: bool = False
# MLA-specific
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None
v_head_dim: int | None = None
# Backend-specific tuning
num_kv_splits: int | None = None # CUTLASS MLA
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
@dataclass
class BenchmarkResult:
"""Results from a single benchmark run."""
config: BenchmarkConfig
mean_time: float # seconds
std_time: float # seconds
min_time: float # seconds
max_time: float # seconds
throughput_tokens_per_sec: float | None = None
memory_allocated_mb: float | None = None
memory_reserved_mb: float | None = None
error: str | None = None
@property
def success(self) -> bool:
"""Whether benchmark completed successfully."""
return self.error is None
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"config": asdict(self.config),
"mean_time": self.mean_time,
"std_time": self.std_time,
"min_time": self.min_time,
"max_time": self.max_time,
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
"memory_allocated_mb": self.memory_allocated_mb,
"memory_reserved_mb": self.memory_reserved_mb,
"error": self.error,
}
class ResultsFormatter:
"""Format and display benchmark results."""
def __init__(self, console: Console | None = None):
self.console = console or Console()
def print_table(
self,
results: list[BenchmarkResult],
backends: list[str],
compare_to_fastest: bool = True,
):
"""
Print results as a rich table.
Args:
results: List of BenchmarkResult
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group by batch spec
by_spec = {}
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
by_spec[spec][r.config.backend] = r
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
# Remove common prefixes
name = name.replace("flashattn_mla", "famla")
name = name.replace("flashinfer_mla", "fimla")
name = name.replace("flashmla", "fmla")
name = name.replace("cutlass_mla", "cmla")
name = name.replace("numsplits", "ns")
return name
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
short_name = shorten_backend_name(backend)
# Time column
col_time = f"{short_name}\nTime (s)"
table.add_column(col_time, justify="right", no_wrap=False)
if multi and compare_to_fastest:
# Relative performance column
col_rel = f"{short_name}\nvs Best"
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in sorted(by_spec.keys()):
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
row = [spec]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
if r.success:
row.append(f"{r.mean_time:.6f}")
if multi and compare_to_fastest:
pct = (
(r.mean_time / best_time * 100) if best_time > 0 else 0
)
pct_str = f"{pct:.1f}%"
if r.mean_time == best_time:
pct_str = f"[bold green]{pct_str}[/]"
row.append(pct_str)
else:
row.append("[red]ERROR[/]")
if multi and compare_to_fastest:
row.append("-")
else:
row.append("-")
if multi and compare_to_fastest:
row.append("-")
table.add_row(*row)
self.console.print(table)
def save_csv(self, results: list[BenchmarkResult], path: str):
"""Save results to CSV file."""
if not results:
return
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", newline="") as f:
writer = csv.DictWriter(
f,
fieldnames=[
"backend",
"batch_spec",
"num_layers",
"mean_time",
"std_time",
"throughput",
"memory_mb",
],
)
writer.writeheader()
for r in results:
writer.writerow(
{
"backend": r.config.backend,
"batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers,
"mean_time": r.mean_time,
"std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0,
"memory_mb": r.memory_allocated_mb or 0,
}
)
self.console.print(f"[green]Saved CSV results to {path}[/]")
def save_json(self, results: list[BenchmarkResult], path: str):
"""Save results to JSON file."""
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
data = [r.to_dict() for r in results]
with open(path, "w") as f:
json.dump(data, f, indent=2, default=str)
self.console.print(f"[green]Saved JSON results to {path}[/]")
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
"""
Get MLA dimensions for known models.
Args:
model_name: Model identifier
Returns:
Dict with MLA dimension configuration
"""
configs = {
"deepseek-v2": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v3": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v2-lite": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 16,
"num_kv_heads": 1,
"head_dim": 576,
},
}
if model_name not in configs:
raise ValueError(
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
)
return configs[model_name]
def get_attention_scale(head_dim: int) -> float:
"""Compute attention scale factor (1/sqrt(d))."""
return 1.0 / math.sqrt(head_dim)
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the backend's is_mla() property.
Args:
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
Returns:
True if the backend is an MLA backend, False otherwise
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_class = AttentionBackendEnum[backend.upper()].get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError):
return False

View File

@@ -1,61 +0,0 @@
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Backend-specific tuning
cutlass_mla:
num_kv_splits: auto # or specific value like 4, 8, 16
flashattn_mla:
reorder_batch_threshold: 512
flashmla:
reorder_batch_threshold: 1

View File

@@ -1,60 +0,0 @@
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
batch_specs:
# Small prefill + decode
- "1q1k_8q1s1k" # 1 prefill + 8 decode
- "2q2k_16q1s1k" # 2 prefill + 16 decode
- "4q1k_32q1s2k" # 4 prefill + 32 decode
# Medium prefill + decode
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
- "8q2k_64q1s4k" # 8 prefill + 64 decode
# Large prefill + decode (chunked prefill stress test)
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint
- "q16k" # 16k prefill with chunking hint
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
# High decode ratio (realistic serving)
- "1q2k_63q1s1k" # 1 prefill + 63 decode
- "2q2k_62q1s2k" # 2 prefill + 62 decode
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Analyze chunked prefill workspace size impact
chunked_prefill:
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]

View File

@@ -1,88 +0,0 @@
# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
# Question: At what query length does the prefill pipeline become faster than the decode pipeline?
# Methodology: For each query length, compare decode vs prefill performance to find the crossover point
# Applies to: FlashAttn MLA, FlashMLA
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: flashattn_mla
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
mode: "decode_vs_prefill"
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
# Each query length will be tested with BOTH decode and prefill pipelines:
# - decode: threshold >= query_length (forces decode pipeline)
# - prefill: threshold < query_length (forces prefill pipeline)
#
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
# This tests different query lengths with fixed sequence length context
#
# Using batch_spec_ranges for automatic generation:
batch_spec_ranges:
- template: "q{q_len}s1k"
q_len:
start: 1
stop: 16
step: 1
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 16
stop: 64
step: 2
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 64
stop: 1024
step: 4
end_inclusive: true
# Batch sizes to test (from old script)
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
# Model configuration (DeepSeek V2/V3 defaults)
model:
num_layers: 10
head_dim: 576
num_q_heads: 128
num_kv_heads: 1
block_size: 128
# Benchmark settings
benchmark:
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:
csv: "reorder_threshold_results.csv"
json: "reorder_threshold_results.json"
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
# - For each batch size, find the crossover point where prefill becomes faster than decode
# - Show decode vs prefill performance across all query lengths
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
# - Understand how crossover point varies with batch size
# - Provide data-driven guidance for default threshold value
#
# Methodology (from old script):
# - Each query length tested with BOTH pipelines:
# * decode: threshold >= query_length (forces decode pipeline)
# * prefill: threshold < query_length (forces prefill pipeline)
# - Compare which is faster to find crossover point
#

View File

@@ -1,62 +0,0 @@
# Speculative decoding benchmark configuration
# Tests reorder_batch_threshold optimization
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
batch_specs:
# Pure speculative decode (K-token verification)
- "q2s1k" # 2-token spec, 1k KV
- "q4s1k" # 4-token spec, 1k KV
- "q8s1k" # 8-token spec, 1k KV
- "q16s1k" # 16-token spec, 1k KV
# Speculative with different context lengths
- "q4s2k" # 4-token spec, 2k KV
- "q4s4k" # 4-token spec, 4k KV
- "q8s2k" # 8-token spec, 2k KV
- "q8s4k" # 8-token spec, 4k KV
# Mixed: speculative + regular decode
- "32q4s1k" # 32 spec requests
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
# Mixed: speculative + prefill + decode
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
# Large batches with speculation
- "64q4s1k" # 64 spec requests
- "32q8s2k" # 32 spec (8-token)
- "16q16s4k" # 16 spec (16-token)
# Backends that support query length > 1
backends:
- flashattn_mla # reorder_batch_threshold = 512
- flashmla # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode, but via a different mechanism
# - flashinfer_mla
# Benchmark settings
benchmark:
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:
param_name: "reorder_batch_threshold"
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
include_auto: false
label_format: "{backend}_threshold_{value}"

View File

@@ -1,40 +0,0 @@
# Standard attention backend benchmark configuration
model:
num_layers: 32
num_q_heads: 32
num_kv_heads: 8 # GQA with 4:1 ratio
head_dim: 128
block_size: 16
batch_specs:
# Pure prefill
- "q512" # Small prefill (512 tokens)
- "q2k" # Medium prefill (2048 tokens)
- "q4k" # Large prefill (4096 tokens)
- "q8k" # Very large prefill (8192 tokens)
# Pure decode
- "8q1s1k" # 8 requests, 1k KV cache each
- "16q1s2k" # 16 requests, 2k KV cache each
- "32q1s1k" # 32 requests, 1k KV cache each
- "64q1s4k" # 64 requests, 4k KV cache each
# Mixed prefill/decode
- "2q2k_8q1s1k" # 2 prefill + 8 decode
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Context extension
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
backends:
- flash
- triton
- flashinfer
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: false

View File

@@ -1,836 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MLA benchmark runner - shared utilities for MLA benchmarks.
This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""
import importlib
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockKVBProj,
MockLayer,
setup_mla_dims,
)
from vllm.config import (
CacheConfig,
CompilationConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
# ============================================================================
# VllmConfig Creation
# ============================================================================
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
"""
Add mock methods for layer-specific queries to ModelConfig.
These methods are needed by metadata builders but aren't normally
present on ModelConfig when used in benchmark contexts.
"""
import types
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
)
def create_minimal_vllm_config(
model_name: str = "deepseek-v3",
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
Args:
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
provided
block_size: KV cache block size
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
Returns:
VllmConfig for benchmarking
"""
# Get MLA dimensions - use provided or load from model name
if mla_dims is None:
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
import json
import os
import shutil
import tempfile
minimal_config = {
"architectures": ["DeepseekV2ForCausalLM"],
"model_type": "deepseek_v2",
"num_attention_heads": mla_dims["num_q_heads"],
"num_key_value_heads": mla_dims["num_kv_heads"],
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
"torch_dtype": "bfloat16",
"max_position_embeddings": 163840, # DeepSeek V3 default
"rope_theta": 10000.0,
"vocab_size": 128256,
}
# Create temporary directory with config.json
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
config_path = os.path.join(temp_dir, "config.json")
with open(config_path, "w") as f:
json.dump(minimal_config, f)
try:
# Create model config using local path - no HF downloads
model_config = ModelConfig(
model=temp_dir, # Use local temp directory
tokenizer=None,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
seed=0,
max_model_len=32768,
quantization=None,
quantization_param_path=None,
enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto",
)
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Override with our mock config
model_config.hf_config = mock_hf_config
model_config.hf_text_config = mock_hf_config
# Add mock methods for layer-specific queries
_add_mock_methods_to_model_config(model_config)
# Create sub-configs
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=8192,
max_model_len=32768,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
parallel_config = ParallelConfig(
tensor_parallel_size=1,
)
compilation_config = CompilationConfig()
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Configuration
# ============================================================================
# Backend name to class name prefix mapping
_BACKEND_NAME_MAP = {
"flashattn_mla": "FlashAttnMLA",
"flashmla": "FlashMLA",
"flashinfer_mla": "FlashInferMLA",
"cutlass_mla": "CutlassMLA",
}
# Special properties that differ from defaults
_BACKEND_PROPERTIES = {
"flashmla": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
},
"flashinfer_mla": {
"block_size": 64, # FlashInfer MLA only supports 32 or 64
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration using naming conventions.
All MLA backends follow the pattern:
- Module: vllm.v1.attention.backends.mla.{backend}
- Impl: {Name}Impl
- Metadata: {Name}Metadata (or MLACommonMetadata)
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
- MetadataBuilder: {Name}MetadataBuilder
"""
if backend not in _BACKEND_NAME_MAP:
raise ValueError(f"Unknown backend: {backend}")
name = _BACKEND_NAME_MAP[backend]
props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return {
"module": f"vllm.v1.attention.backends.mla.{backend}",
"impl_class": f"{name}Impl",
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"),
"block_size": props.get("block_size", None),
}
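# Illustrative sketch (an assumption for documentation purposes, not used by the
# benchmark itself): per the naming convention above, _get_backend_config("flashmla")
# resolves to the FlashMLA classes plus the overrides from _BACKEND_PROPERTIES.
def _example_backend_config() -> dict:
    cfg = _get_backend_config("flashmla")
    # Expected keys/values (derived from the maps above, not re-verified against
    # the installed vLLM build):
    #   module:         "vllm.v1.attention.backends.mla.flashmla"
    #   impl_class:     "FlashMLAImpl"
    #   metadata_class: "FlashMLAMetadata"
    #   builder_class:  "FlashMLAMetadataBuilder"
    #   query_format:   "concat"  (single concatenated query tensor)
    #   block_size:     64        (FlashMLA's fixed block size)
    return cfg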
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_attention_metadata(
requests: list,
block_size: int,
device: torch.device,
builder_instance,
) -> tuple:
"""
Build attention metadata from batch requests.
Args:
requests: List of BatchRequest objects
block_size: KV cache block size
device: Target device
builder_instance: Metadata builder instance
Returns:
Tuple of (metadata, kv_cache_num_blocks)
"""
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
# Build query start locations
q_start_cpu = torch.tensor(
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
dtype=torch.int32,
)
q_start_gpu = q_start_cpu.to(device)
# Build sequence lengths
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
seq_lens_gpu = seq_lens_cpu.to(device)
# Build num_computed_tokens (context length for each request)
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
# Build block table
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
max_num_blocks = max(num_blocks_per_req)
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
current_block = 0
for i, num_blocks in enumerate(num_blocks_per_req):
for j in range(num_blocks):
block_table_cpu[i, j] = current_block
current_block += 1
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
# Build slot mapping
slot_mapping_list = []
for i, (q_len, kv_len, num_blocks) in enumerate(
zip(q_lens, kv_lens, num_blocks_per_req)
):
context_len = kv_len - q_len
for j in range(q_len):
token_kv_idx = context_len + j
block_idx = token_kv_idx // block_size
offset_in_block = token_kv_idx % block_size
global_block_id = block_table_cpu[i, block_idx]
slot_id = global_block_id * block_size + offset_in_block
slot_mapping_list.append(slot_id)
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
# Create CommonAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
common_attn_metadata = CommonAttentionMetadata(
num_reqs=len(requests),
max_query_len=max(q_lens),
max_seq_len=max_kv,
num_actual_tokens=total_q,
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
block_table_tensor=block_table_gpu,
dcp_local_seq_lens=None,
)
# Use the production build() method
metadata = builder_instance.build(
common_prefix_len=0,
common_attn_metadata=common_attn_metadata,
fast_build=False,
)
return metadata, current_block
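# Worked example of the slot-mapping arithmetic above (a sketch; the numbers are
# hypothetical): with block_size=16, a request with q_len=4 and kv_len=20 has
# context_len=16, so its new tokens land at kv indices 16..19, i.e. block_idx=1,
# offsets 0..3. If that request's block-table row maps block 1 to global block 7,
# the slots are 7*16 + 0 .. 7*16 + 3 = 112..115.
def _example_slot_ids(
    context_len: int, q_len: int, block_size: int, block_row: list[int]
) -> list[int]:
    slots = []
    for j in range(q_len):
        token_kv_idx = context_len + j
        slots.append(
            block_row[token_kv_idx // block_size] * block_size
            + token_kv_idx % block_size
        )
    return slots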
def _create_input_tensors(
total_q: int,
mla_dims: dict,
query_format: str,
device: torch.device,
dtype: torch.dtype,
):
"""
Create input tensors for both decode and prefill modes.
MLA requires different tensor formats for decode vs prefill:
- Decode: Uses kv_lora_rank (512) dimension
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
Args:
total_q: Total number of query tokens
mla_dims: MLA dimension configuration
query_format: Either "tuple" or "concat"
device: Target device
dtype: Tensor dtype
Returns:
Tuple of (decode_inputs, prefill_inputs)
- decode_inputs: Query tensor(s) for decode mode
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale', and 'output' for prefill
"""
if query_format == "tuple":
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
q_nope_decode = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
q_pe = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
decode_inputs = (q_nope_decode, q_pe)
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
q_nope_prefill = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"],
device=device,
dtype=dtype,
)
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
else: # concat
decode_inputs = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# For prefill with concat format
prefill_q = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# Create additional inputs needed for prefill forward
k_c_normed = torch.randn(
total_q,
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
k_pe = torch.randn(
total_q,
1, # Single head for MLA
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
k_scale = torch.ones(1, device=device, dtype=torch.float32)
output = torch.zeros(
total_q,
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
device=device,
dtype=dtype,
)
prefill_inputs = {
"q": prefill_q,
"k_c_normed": k_c_normed,
"k_pe": k_pe,
"k_scale": k_scale,
"output": output,
}
return decode_inputs, prefill_inputs
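# Shape sketch (assumption: DeepSeek-V3-style dims with 128 query heads, matching
# the fallback in _extract_mla_dims_from_config): calling the helper above on CPU
# just to illustrate the decode vs prefill query layouts.
def _example_input_shapes(total_q: int = 8) -> dict:
    dims = {
        "kv_lora_rank": 512, "qk_nope_head_dim": 128, "qk_rope_head_dim": 64,
        "v_head_dim": 128, "num_q_heads": 128, "num_kv_heads": 1, "head_dim": 576,
    }
    decode_inputs, prefill_inputs = _create_input_tensors(
        total_q, dims, "tuple", torch.device("cpu"), torch.float32
    )
    return {
        "decode_q_nope": tuple(decode_inputs[0].shape),  # (total_q, 128, 512)
        "decode_q_pe": tuple(decode_inputs[1].shape),    # (total_q, 128, 64)
        "prefill_q": tuple(prefill_inputs["q"].shape),   # (total_q, 128, 192)
        "k_pe": tuple(prefill_inputs["k_pe"].shape),     # (total_q, 1, 64)
    }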
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
Returns:
Tuple of (impl, layer, builder_instance)
"""
# Import backend classes
backend_module = importlib.import_module(backend_cfg["module"])
impl_class = getattr(backend_module, backend_cfg["impl_class"])
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
# Create mock kv_b_proj layer for prefill mode
mock_kv_b_proj = MockKVBProj(
num_heads=mla_dims["num_q_heads"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
)
# Create impl
impl = impl_class(
num_heads=mla_dims["num_q_heads"],
head_size=mla_dims["head_dim"],
scale=scale,
num_kv_heads=mla_dims["num_kv_heads"],
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
impl.dcp_world_size = 1
impl.dcp_rank = 0
# Create KV cache spec for MockLayer
from vllm.v1.kv_cache_interface import FullAttentionSpec
kv_cache_spec = FullAttentionSpec(
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
num_kv_heads=1, # MLA uses 1 KV head
head_size=mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],  # 576 for DeepSeek MLA
dtype=torch.bfloat16,
)
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
# Create builder instance if needed
builder_instance = None
if backend_cfg["builder_class"]:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
builder_instance = builder_class(
kv_cache_spec=kv_cache_spec,
layer_names=["placeholder"],
vllm_config=vllm_config,
device=device,
)
return impl, layer, builder_instance
# ============================================================================
# Config Helpers
# ============================================================================
def _extract_mla_dims_from_config(config) -> dict | None:
"""
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
Args:
config: BenchmarkConfig instance
Returns:
Dict with MLA dimensions if all fields are provided, None otherwise
"""
# Check if all MLA-specific fields are provided
if all(
[
config.kv_lora_rank is not None,
config.qk_nope_head_dim is not None,
config.qk_rope_head_dim is not None,
config.v_head_dim is not None,
]
):
return {
"kv_lora_rank": config.kv_lora_rank,
"qk_nope_head_dim": config.qk_nope_head_dim,
"qk_rope_head_dim": config.qk_rope_head_dim,
"v_head_dim": config.v_head_dim,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
# Fallback: if MLA fields not fully specified, try to construct from basic fields
elif config.head_dim == 576:
# This looks like a DeepSeek MLA config, use standard dimensions with custom
# head count
return {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
return None
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
Args:
config: BenchmarkConfig instance
impl: Backend implementation instance
layer: MockLayer instance
builder_instance: Metadata builder instance
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
Returns:
BenchmarkResult with timing statistics
"""
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
total_q = sum(q_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
# Build metadata
metadata, num_blocks = _build_attention_metadata(
requests, block_size, device, builder_instance
)
# Create KV cache
kv_cache = torch.zeros(
num_blocks,
block_size,
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=torch.bfloat16,
)
# Create input tensors for both decode and prefill modes
decode_inputs, prefill_inputs = _create_input_tensors(
total_q,
mla_dims,
backend_cfg["query_format"],
device,
torch.bfloat16,
)
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else:
raise RuntimeError("Metadata has neither decode nor prefill metadata")
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(config.num_layers):
forward_fn()
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
mean_time = float(np.mean(times))
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=float(np.std(times)),
min_time=float(np.min(times)),
max_time=float(np.max(times)),
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
)
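# Timing note (worked example with made-up numbers): if one repeat of the inner
# loop over config.num_layers=4 layers records elapsed_ms=20.0, the stored sample
# is 20.0 / 1000 / 4 = 0.005 s per layer, and with total_q=256 tokens the reported
# throughput is 256 / 0.005 = 51200 tokens/s for that mean time.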
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
Args:
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
Returns:
List of BenchmarkResult objects
"""
if not configs_with_params:
return []
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
block_size = backend_cfg["block_size"] or config_block_size
# Extract MLA dimensions from the first config
first_config = configs_with_params[0][0]
mla_dims = _extract_mla_dims_from_config(first_config)
# If config didn't provide MLA dims, fall back to default model
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, and builder (reused across benchmarks)
impl, layer, builder_instance = _create_backend_impl(
backend_cfg, mla_dims, vllm_config, device
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
original_threshold = None
if threshold is not None and builder_instance:
original_threshold = builder_instance.reorder_batch_threshold
builder_instance.reorder_batch_threshold = threshold
# Set num_splits for CUTLASS
original_num_splits = None
if num_splits is not None and hasattr(impl, "_num_kv_splits"):
original_num_splits = impl._num_kv_splits
impl._num_kv_splits = num_splits
try:
result = _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg,
mla_dims,
device,
)
results.append(result)
finally:
# Restore original threshold
if original_threshold is not None:
builder_instance.reorder_batch_threshold = original_threshold
# Restore original num_splits
if original_num_splits is not None:
impl._num_kv_splits = original_num_splits
return results
# ============================================================================
# Public API
# ============================================================================
def run_mla_benchmark(
backend: str,
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
"""
# Normalize to batched mode: (config, threshold, num_splits)
if isinstance(config, list):
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla or flashinfer_mla
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
configs_with_params = [(cfg, None, None) for cfg in config]
return_single = False
else:
# Single config: convert to batched format
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params)
# Return single result or list based on input
return results[0] if return_single else results
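# Usage sketch (assumptions: BenchmarkConfig lives in common.py as in the other
# runners, a CUDA device is available, and the batch_spec string below is
# hypothetical -- see batch_spec.parse_batch_spec for the real grammar). This
# helper is only illustrative and is not executed on import.
def _example_run_mla_benchmark():
    from common import BenchmarkConfig  # assumed import, mirroring common.py usage
    cfg = BenchmarkConfig(
        backend="flashattn_mla",
        batch_spec="8q1s1024",  # hypothetical spec: 8 decode requests, 1024-token KV
        block_size=16,
        num_q_heads=128,
        num_kv_heads=1,
        head_dim=576,
        num_layers=4,
        warmup_iters=3,
        repeats=10,
        device="cuda:0",
    )
    # Single-config mode returns one BenchmarkResult.
    single = run_mla_benchmark("flashattn_mla", cfg, reorder_batch_threshold=1)
    # Batched mode reuses the backend setup across (config, threshold) pairs.
    batched = run_mla_benchmark("flashattn_mla", [(cfg, 1), (cfg, 4)])
    return single, batched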

View File

@@ -1,481 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import types
import numpy as np
import torch
from batch_spec import parse_batch_spec, reorder_for_flashinfer
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
from vllm.config import (
CacheConfig,
CompilationConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
)
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
# Backend Configuration
# ============================================================================
_BACKEND_CONFIG = {
"flash": {
"module": "vllm.v1.attention.backends.flash_attn",
"backend_class": "FlashAttentionBackend",
"dtype": torch.float16,
"cache_layout": "standard",
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
},
"triton": {
"module": "vllm.v1.attention.backends.triton_attn",
"backend_class": "TritonAttentionBackend",
"dtype": torch.float32,
"cache_layout": "standard",
},
"flashinfer": {
"module": "vllm.v1.attention.backends.flashinfer",
"backend_class": "FlashInferBackend",
"dtype": torch.float16,
"cache_layout": "flashinfer",
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
},
}
def _get_backend_config(backend: str) -> dict:
if backend not in _BACKEND_CONFIG:
raise ValueError(
f"Unknown backend: {backend}. "
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
)
return _BACKEND_CONFIG[backend]
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_common_attn_metadata(
q_lens: list[int],
kv_lens: list[int],
block_size: int,
device: torch.device,
) -> CommonAttentionMetadata:
"""Build CommonAttentionMetadata from query/kv lengths."""
batch_size = len(q_lens)
total_tokens = sum(q_lens)
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
0
)
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
seq_lens_cpu = seq_lens.cpu()
max_seq_len = int(seq_lens_cpu.max())
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
block_table_tensor = torch.arange(
num_blocks, dtype=torch.int32, device=device
).view(batch_size, max_blocks)
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
max_query_len = max(q_lens)
return CommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
causal=True,
)
def _create_vllm_config(
config: BenchmarkConfig,
dtype: torch.dtype,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
model_config = ModelConfig(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype=dtype,
seed=0,
max_model_len=1024,
)
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
parallel_config = ParallelConfig(tensor_parallel_size=1)
scheduler_config = SchedulerConfig(
max_num_seqs=256,
max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
device_config = DeviceConfig()
load_config = LoadConfig()
compilation_config = CompilationConfig()
# Add mock methods for benchmark config values
model_config.get_num_layers = types.MethodType(
lambda self: config.num_layers, model_config
)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, i: 0.0, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, i: 1.0 / config.head_dim**0.5, model_config
)
model_config.get_num_attention_heads = types.MethodType(
lambda self, parallel_config=None: config.num_q_heads, model_config
)
model_config.get_num_kv_heads = types.MethodType(
lambda self, parallel_config=None: config.num_kv_heads, model_config
)
model_config.get_head_size = types.MethodType(
lambda self: config.head_dim, model_config
)
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
load_config=load_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
):
"""Create backend implementation instance."""
import importlib
backend_module = importlib.import_module(backend_cfg["module"])
backend_class = getattr(backend_module, backend_cfg["backend_class"])
scale = get_attention_scale(config.head_dim)
dtype = backend_cfg["dtype"]
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
head_size=config.head_dim,
scale=scale,
num_kv_heads=config.num_kv_heads,
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer, dtype
def _create_metadata_builder(
backend_class,
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
):
"""Create metadata builder instance."""
return backend_class.get_builder_cls()(
kv_cache_spec=kv_cache_spec,
layer_names=["layer_0"],
vllm_config=vllm_config,
device=device,
)
# ============================================================================
# Tensor Creation Helpers
# ============================================================================
def _create_input_tensors(
config: BenchmarkConfig,
total_q: int,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Create Q, K, V input tensors for all layers."""
q_list = [
torch.randn(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
k_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
v_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
return q_list, k_list, v_list
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
cache_layout: str,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers."""
if cache_layout == "flashinfer":
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
max_num_blocks,
2,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
else:
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
2,
max_num_blocks,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
return cache_list
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config: BenchmarkConfig,
impl,
layer,
q_list: list,
k_list: list,
v_list: list,
cache_list: list,
attn_metadata,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Run single benchmark iteration with warmup and timing loop."""
total_q = q_list[0].shape[0]
out = torch.empty(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
# Warmup
for _ in range(config.warmup_iters):
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
# ============================================================================
# Public API
# ============================================================================
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: flash, triton, flashinfer
Args:
config: Benchmark configuration
Returns:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)
requests = parse_batch_spec(config.batch_spec)
if config.backend == "flashinfer":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
backend_class, impl, layer, dtype = _create_backend_impl(
backend_cfg, config, device
)
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=np.std(times),
min_time=np.min(times),
max_time=np.max(times),
throughput_tokens_per_sec=throughput,
memory_allocated_mb=mem_stats.get("allocated_mb"),
memory_reserved_mb=mem_stats.get("reserved_mb"),
)

View File

@@ -20,12 +20,8 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 PROVIDER_CFGS = {
-    "vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
-    "vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
-    "flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
-    "flashinfer-swizzle": dict(
-        backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
-    ),
+    "vllm": dict(backend="vllm", enabled=True),
+    "flashinfer": dict(backend="flashinfer", enabled=True),
 }
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
@@ -40,7 +36,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
 @triton.testing.perf_report(
     triton.testing.Benchmark(
         x_names=["batch_size"],
-        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
         x_log=False,
         line_arg="provider",
         line_vals=_enabled,
@@ -67,36 +63,19 @@ def benchmark(batch_size, provider, N, K):
     if cfg["backend"] == "vllm":
         # vLLM's FP4 quantization
-        if cfg["is_sf_swizzled_layout"]:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: ops.scaled_fp4_quant(
-                    a, a_global_scale, is_sf_swizzled_layout=True
-                ),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: ops.scaled_fp4_quant(
-                    a, a_global_scale, is_sf_swizzled_layout=False
-                ),
-                quantiles=quantiles,
-            )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: ops.scaled_fp4_quant(a, a_global_scale),
+            quantiles=quantiles,
+        )
     elif cfg["backend"] == "flashinfer":
         # FlashInfer's FP4 quantization
-        if cfg["is_sf_swizzled_layout"]:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: flashinfer_fp4_quantize(
-                    a, a_global_scale, is_sf_swizzled_layout=True
-                ),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: flashinfer_fp4_quantize(
-                    a, a_global_scale, is_sf_swizzled_layout=False
-                ),
-                quantiles=quantiles,
-            )
+        # Use is_sf_swizzled_layout=True to match vLLM's output format
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: flashinfer_fp4_quantize(
+                a, a_global_scale, is_sf_swizzled_layout=True
+            ),
+            quantiles=quantiles,
+        )
     # Convert ms to us for better readability at small batch sizes
     to_us = lambda t_ms: t_ms * 1000
@@ -113,9 +92,7 @@ def prepare_shapes(args):
     return out
-def _test_accuracy_once(
-    M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
-):
+def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
     """Test accuracy between vLLM and FlashInfer FP4 quantization."""
     # Create input tensor
     a = torch.randn((M, K), device=device, dtype=dtype)
@@ -124,13 +101,11 @@ def _test_accuracy_once(
     a_global_scale = compute_global_scale(a)
     # vLLM quantization
-    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
-        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
-    )
+    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
     # FlashInfer quantization (with swizzled layout to match vLLM's output)
     flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
-        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
+        a, a_global_scale, is_sf_swizzled_layout=True
     )
     flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
@@ -139,14 +114,7 @@ def _test_accuracy_once(
         vllm_fp4,
         flashinfer_fp4,
     )
-    # Compare scales
-    torch.testing.assert_close(
-        vllm_scale,
-        flashinfer_scale,
-    )
-    print(
-        f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED"  # noqa: E501
-    )
+    print(f"M={M}, K={K}, dtype={dtype}: PASSED")
 def test_accuracy():
@@ -162,10 +130,9 @@ def test_accuracy():
     Ms = [1, 1024]
     Ks = [4096]
-    for is_sf_swizzled_layout in [True, False]:
-        for M in Ms:
-            for K in Ks:
-                _test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)
+    for M in Ms:
+        for K in Ks:
+            _test_accuracy_once(M, K, dtype, device)
     print("\nAll accuracy tests passed!")
@@ -178,7 +145,7 @@ if __name__ == "__main__":
         "--models",
         nargs="+",
         type=str,
-        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
         choices=list(WEIGHT_SHAPES.keys()),
     )
     parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])

View File

@@ -7,7 +7,7 @@ import itertools
 import torch
 import vllm.model_executor.layers.activation # noqa F401
-from vllm.model_executor.custom_op import op_registry
+from vllm.model_executor.custom_op import CustomOp
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@@ -33,14 +33,14 @@ def benchmark_activation(
     torch.set_default_device(device)
     if func_name == "gelu_and_mul":
-        layer = op_registry[func_name](approximate="none")
+        layer = CustomOp.op_registry[func_name](approximate="none")
     elif func_name == "gelu_and_mul_tanh":
-        layer = op_registry["gelu_and_mul"](approximate="tanh")
+        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
     elif func_name == "fatrelu_and_mul":
         threshold = 0.5
-        layer = op_registry[func_name](threshold)
+        layer = CustomOp.op_registry[func_name](threshold)
     else:
-        layer = op_registry[func_name]()
+        layer = CustomOp.op_registry[func_name]()
     x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
     compiled_layer = torch.compile(layer.forward_native)

View File

@@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from packaging import version
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION,
)
try:
import bitblas
if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
raise ImportError(
"bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
)
except ImportError as e:
bitblas_import_exception = e
raise ValueError(
"Trying to use the bitblas backend, but could not import"
f"with the following error: {bitblas_import_exception}. "
"Please install bitblas through the following command: "
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
) from bitblas_import_exception
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils.argparse_utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target."
)
# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking.",
)
parser.add_argument(
"--group_size", type=int, default=None, help="Group size for grouped quantization."
)
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32", "int8"],
help="Data type of activation A.",
)
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=[
"float16",
"float32",
"float64",
"int32",
"int8",
"int4",
"int2",
"int1",
"nf4",
"fp4_e2m1",
],
help="Data type of weight W.",
)
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
choices=["float16", "int32"],
help="Data type for accumulation.",
)
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
choices=["float16", "float32", "int32", "int8"],
help="Data type for output.",
)
parser.add_argument(
"--layout",
type=str,
default="nt",
choices=["nt", "nn"],
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument(
"--with_bias", action="store_true", help="Include bias in the benchmark."
)
parser.add_argument(
"--with_scaling",
action="store_true",
help="Include scaling factor in the quantization.",
)
parser.add_argument(
"--with_zeros", action="store_true", help="Include zeros in the quantization."
)
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
choices=["original", "rescale", "quantized"],
help="Specify the mode for calculating zeros.",
)
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
# Define a list of shared arguments that repeat in every config
shared_args = [
A_dtype,
W_dtype,
out_dtype,
accum_dtype,
layout,
with_bias,
group_size,
with_scaling,
with_zeros,
zeros_mode,
]
# Define just the (M, K, N) shapes in a more compact list
shapes = [
# square test
(1, 16384, 16384),
# BLOOM-176B
(1, 43008, 14336),
(1, 14336, 14336),
(1, 57344, 14336),
(1, 14336, 57344),
# OPT-65B
(1, 9216, 9216),
(1, 36864, 9216),
(1, 9216, 36864),
(1, 22016, 8192),
# LLAMA-70B/65B
(1, 8192, 22016),
(1, 8192, 8192),
(1, 28672, 8192),
(1, 8192, 28672),
# square test
(16384, 16384, 16384),
# BLOOM-176B
(8192, 43008, 14336),
(8192, 14336, 14336),
(8192, 57344, 14336),
(8192, 14336, 57344),
# OPT-65B
(8192, 9216, 9216),
(8192, 36864, 9216),
(8192, 9216, 36864),
(8192, 22016, 8192),
# LLAMA-70B/65B
(8192, 8192, 22016),
(8192, 8192, 8192),
(8192, 28672, 8192),
(8192, 8192, 28672),
]
# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]
benchmark_sets = []
benchmark_sets.extend(test_shapes)
benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
config = config_class(*input_args)
matmul = operator(config, target=target, enable_tuning=True)
kernel_latency = matmul.profile_latency()
print("Time cost is: {:.3f} ms".format(kernel_latency))
profile_config = {
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
"BitBLAS_top20_latency": kernel_latency,
}
}
benchmark_results.update(profile_config)
# Define headers for the table
headers = [
"PrimFunc",
"Input Arguments",
"BitBLAS Top20 Latency",
]
# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
col_widths[2] = max(
col_widths[2],
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
len(headers[2]) + 2,
)
# break only if you want to measure widths from a single example;
# otherwise, let it loop over all items.
# Print header
for i, header in enumerate(headers):
headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))
# Print rows
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
row = [
func_name,
input_args_str,
f"{values['BitBLAS_top20_latency']:.3f} ms",
]
row_str = "".join(
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
)
print(row_str)
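# Example invocation (illustrative only; the script filename is hypothetical, the
# flags are the ones defined by the parser above):
#   python benchmark_bitblas.py --A_dtype float16 --W_dtype int4 \
#       --accum_dtype float16 --group_size 128 --with_scaling --with_zeros \
#       --zeros_mode original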

View File

@@ -9,7 +9,6 @@ but use different quantization strategies and backends.
 import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
@@ -139,13 +138,12 @@ def bench_run(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=num_experts,
-                hidden_dim=k,
-                intermediate_size_per_partition=n,
-                in_dtype=a.dtype,
-            ),
+            out_dtype=a.dtype,
+            e=num_experts,
+            n=n,
+            k=k,
             quant_config=quant_config,
-            device=w1.device,
         ),
     )

View File

@@ -12,7 +12,6 @@ import torch
 import torch.utils.benchmark as benchmark
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
@@ -197,9 +196,10 @@ def bench_run(
     )
     kernel = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
+        MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
         CutlassExpertsFp4(
-            make_dummy_moe_config(),
+            out_dtype=dtype,
+            max_experts_per_worker=e,
             quant_config=quant_config,
         ),
     )
@@ -242,9 +242,10 @@ def bench_run(
     )
     kernel = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
+        MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
         CutlassExpertsFp4(
-            make_dummy_moe_config(),
+            out_dtype=dtype,
+            max_experts_per_worker=e,
             quant_config=quant_config,
         ),
    )

View File

@@ -1,99 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import torch
from vllm.model_executor.layers.fused_moe.router.fused_topk_router import fused_topk
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
num_tokens_range = [2**i for i in range(0, 8, 2)]
num_experts_range = [16, 32, 64, 128, 256, 512]
topk_range = [3, 4]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
def torch_topk(
gating_output: torch.Tensor,
topk: int,
renormalize: bool,
scoring_func: str = "softmax",
):
if scoring_func == "softmax":
scores = torch.softmax(gating_output.float(), dim=-1)
else:
scores = torch.sigmoid(gating_output.float())
topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
if renormalize:
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
return topk_weights, topk_ids
def get_benchmark(scoring_func):
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["torch", "vllm"],
line_names=["Torch", "vLLM"],
styles=[("blue", "-"), ("red", "-")],
ylabel="us",
plot_name=f"fused-topk-perf-{scoring_func}",
args={},
)
)
def benchmark(num_tokens, num_experts, topk, provider):
dtype = torch.bfloat16
hidden_size = 1024
renormalize = True
hidden_states = torch.randn(
(num_tokens, hidden_size), dtype=dtype, device="cuda"
)
gating_output = torch.randn(
(num_tokens, num_experts), dtype=dtype, device="cuda"
)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: torch_topk(
gating_output=gating_output,
topk=topk,
renormalize=renormalize,
scoring_func=scoring_func,
),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: fused_topk(
hidden_states=hidden_states,
gating_output=gating_output,
topk=topk,
renormalize=renormalize,
scoring_func=scoring_func,
),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the MoE topk kernel.")
parser.add_argument("--scoring-func", type=str, default="softmax")
parser.add_argument("--save-path", type=str, default="./configs/fused_topk/")
args = parser.parse_args()
# Get the benchmark function
benchmark = get_benchmark(args.scoring_func)
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

View File

@@ -6,7 +6,6 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES_MOE
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
@@ -135,13 +134,13 @@ def bench_run(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=w2.shape[0],
-                hidden_dim=w2.shape[1],
-                intermediate_size_per_partition=w2.shape[2],
-                in_dtype=a.dtype,
-            ),
+            out_dtype=a.dtype,
+            # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+            e=w2.shape[0],
+            n=w2.shape[2],
+            k=w2.shape[1],
             quant_config=quant_config,
-            device=w1.device,
         ),
     )
@@ -167,13 +166,13 @@ def bench_run(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=w2.shape[0],
-                hidden_dim=w2.shape[1],
-                intermediate_size_per_partition=w2.shape[2],
-                in_dtype=a.dtype,
-            ),
+            out_dtype=a.dtype,
+            # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+            e=w2.shape[0],
+            n=w2.shape[2],
+            k=w2.shape[1],
             quant_config=quant_config,
-            device=w1.device,
         ),
     )

View File

@@ -231,7 +231,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
     assert bt.w_tok_s is None
     assert bt.group_size is not None
-    fn = lambda: ops.marlin_gemm(
+    fn = lambda: ops.gptq_marlin_gemm(
         a=bt.a,
         c=None,
         b_q_weight=w_q,

View File

@@ -6,6 +6,12 @@ import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL,
GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
)
from vllm.model_executor.layers.quantization.utils.allspark_utils import ( from vllm.model_executor.layers.quantization.utils.allspark_utils import (
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
ALLSPARK_SUPPORTED_QUANT_TYPES, ALLSPARK_SUPPORTED_QUANT_TYPES,
@@ -28,6 +34,9 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
awq_marlin_quantize, awq_marlin_quantize,
marlin_quantize, marlin_quantize,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, gptq_pack,
gptq_quantize_weights, gptq_quantize_weights,
@@ -69,7 +78,14 @@ def bench_run(
if size_k % group_size != 0: if size_k % group_size != 0:
return return
repack_supported = group_size in MARLIN_SUPPORTED_GROUP_SIZES marlin_24_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
allspark_supported = ( allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1 and group_size == -1
@@ -110,6 +126,14 @@ def bench_run(
marlin_sort_indices, marlin_sort_indices,
) )
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params(): def gen_repack_params():
q_w_gptq = None q_w_gptq = None
repack_sort_indices = None repack_sort_indices = None
@@ -164,6 +188,9 @@ def bench_run(
marlin_g_idx, marlin_g_idx,
marlin_sort_indices, marlin_sort_indices,
) = gen_marlin_params() ) = gen_marlin_params()
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
gen_marlin_24_params()
)
q_w_gptq, repack_sort_indices = gen_repack_params() q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
gen_allspark_params() gen_allspark_params()
@@ -173,6 +200,9 @@ def bench_run(
marlin_workspace = MarlinWorkspace( marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
) )
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
globals = { globals = {
# Gen params # Gen params
@@ -192,6 +222,12 @@ def bench_run(
"marlin_sort_indices": marlin_sort_indices, "marlin_sort_indices": marlin_sort_indices,
"marlin_workspace": marlin_workspace, "marlin_workspace": marlin_workspace,
"is_k_full": is_k_full, "is_k_full": is_k_full,
# Marlin_24 params
"marlin_24_w_ref": marlin_24_w_ref,
"marlin_24_q_w_comp": marlin_24_q_w_comp,
"marlin_24_meta": marlin_24_meta,
"marlin_24_s": marlin_24_s,
"marlin_24_workspace": marlin_24_workspace,
# GPTQ params # GPTQ params
"q_w_gptq": q_w_gptq, "q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices, "repack_sort_indices": repack_sort_indices,
@@ -203,7 +239,8 @@ def bench_run(
"sm_version": sm_version, "sm_version": sm_version,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD, "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
# Kernels # Kernels
"marlin_gemm": ops.marlin_gemm, "gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
"gptq_marlin_repack": ops.gptq_marlin_repack, "gptq_marlin_repack": ops.gptq_marlin_repack,
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm, "allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
} }
@@ -226,24 +263,35 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
description="marlin_gemm", description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
description="marlin_gemm_fp32", description="gptq_marlin_gemm_fp32",
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
if marlin_24_supported:
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_24_gemm",
).blocked_autorange(min_run_time=min_run_time)
)
if repack_supported: if repack_supported:
results.append( results.append(
benchmark.Timer( benchmark.Timer(

View File

@@ -15,18 +15,11 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm
-from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig,
-    FusedMoEParallelConfig,
     FusedMoEQuantConfig,
-    RoutingMethodType,
     _get_config_dtype_str,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-    TritonOrDeepGemmExperts,
-)
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
@@ -201,36 +194,10 @@ def benchmark_config(
             block_shape=block_quant_shape,
         )
-        deep_gemm_experts = None
-        if use_deep_gemm:
-            deep_gemm_experts = mk.FusedMoEModularKernel(
-                prepare_finalize=MoEPrepareAndFinalizeNoEP(),
-                fused_experts=TritonOrDeepGemmExperts(
-                    moe_config=FusedMoEConfig(
-                        num_experts=num_experts,
-                        experts_per_token=topk,
-                        hidden_dim=hidden_size,
-                        intermediate_size_per_partition=shard_intermediate_size,
-                        num_local_experts=num_experts,
-                        activation="silu",
-                        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
-                        in_dtype=init_dtype,
-                        routing_method=RoutingMethodType.TopK,
-                        device="cuda",
-                    ),
-                    quant_config=quant_config,
-                ),
-            )
         with override_config(config):
             topk_weights, topk_ids, token_expert_indices = fused_topk(
                 x, input_gating, topk, renormalize=not use_deep_gemm
             )
-            if use_deep_gemm:
-                return deep_gemm_experts(
-                    x, w1, w2, topk_weights, topk_ids, inplace=True
-                )
             return fused_experts(
                 x,
                 w1,
@@ -239,6 +206,7 @@ def benchmark_config(
                 topk_ids,
                 inplace=True,
                 quant_config=quant_config,
+                allow_deep_gemm=use_deep_gemm,
             )
     # JIT compilation & warmup
@@ -675,7 +643,6 @@ def main(args: argparse.Namespace):
         "DeepseekV3ForCausalLM",
         "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
-        "Glm4MoeLiteForCausalLM",
         "NemotronHForCausalLM",
     ):
         E = config.n_routed_experts

View File

@@ -8,8 +8,10 @@ import ray
import torch import torch
from transformers import AutoConfig from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute,
_moe_unpermute_and_reduce,
moe_permute, moe_permute,
moe_unpermute, moe_unpermute,
) )
@@ -39,6 +41,7 @@ def benchmark_permute(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
use_customized_permute: bool = False,
) -> float: ) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
@@ -61,14 +64,31 @@ def benchmark_permute(
input_gating.copy_(gating_output[i]) input_gating.copy_(gating_output[i])
def run(): def run():
moe_permute( if use_customized_permute:
qhidden_states, (
a1q_scale=None, permuted_hidden_states,
topk_ids=topk_ids, a1q_scale,
n_expert=num_experts, first_token_off,
expert_map=None, inv_perm_idx,
align_block_size=align_block_size, m_indices,
) ) = moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
else:
(
permuted_hidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = _moe_permute(
qhidden_states, None, topk_ids, num_experts, None, align_block_size
)
# JIT compilation & warmup # JIT compilation & warmup
run() run()
@@ -113,9 +133,11 @@ def benchmark_unpermute(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
use_customized_permute: bool = False,
) -> float: ) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8: if use_fp8_w8a8:
align_block_size = 128 # deepgemm needs 128 m aligned block align_block_size = 128 # deepgemm needs 128 m aligned block
qhidden_states, scale = _fp8_quantize(hidden_states, None, None) qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
@@ -130,37 +152,78 @@ def benchmark_unpermute(
) )
def prepare(): def prepare():
( if use_customized_permute:
permuted_hidden_states, (
_, permuted_hidden_states,
first_token_off, a1q_scale,
inv_perm_idx, first_token_off,
_, inv_perm_idx,
) = moe_permute( m_indices,
qhidden_states, ) = moe_permute(
a1q_scale=None, qhidden_states,
topk_ids=topk_ids, a1q_scale=None,
n_expert=num_experts, topk_ids=topk_ids,
expert_map=None, n_expert=num_experts,
align_block_size=align_block_size, expert_map=None,
) align_block_size=align_block_size,
# convert to fp16/bf16 as gemm output )
return ( # convert to fp16/bf16 as gemm output
permuted_hidden_states.to(dtype), return (
first_token_off, permuted_hidden_states.to(dtype),
inv_perm_idx, first_token_off,
) inv_perm_idx,
m_indices,
)
else:
(
permuted_qhidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = _moe_permute(
qhidden_states, None, topk_ids, num_experts, None, align_block_size
)
# convert to fp16/bf16 as gemm output
return (
permuted_qhidden_states.to(dtype),
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
)
def run(input: tuple): def run(input: tuple):
(permuted_hidden_states, first_token_off, inv_perm_idx) = input if use_customized_permute:
output = torch.empty_like(hidden_states) (
moe_unpermute( permuted_hidden_states,
output, first_token_off,
permuted_hidden_states, inv_perm_idx,
topk_weights, m_indices,
inv_perm_idx, ) = input
first_token_off, output = torch.empty_like(hidden_states)
) moe_unpermute(
output,
permuted_hidden_states,
topk_weights,
inv_perm_idx,
first_token_off,
)
else:
(
permuted_hidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = input
_moe_unpermute_and_reduce(
output_hidden_states,
permuted_hidden_states,
inv_perm,
topk_weights,
True,
)
# JIT compilation & warmup # JIT compilation & warmup
input = prepare() input = prepare()
@@ -215,7 +278,8 @@ class BenchmarkWorker:
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
) -> tuple[float, float]: use_customized_permute: bool = False,
) -> tuple[dict[str, int], float]:
set_random_seed(self.seed) set_random_seed(self.seed)
permute_time = benchmark_permute( permute_time = benchmark_permute(
@@ -227,6 +291,7 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
use_customized_permute=use_customized_permute,
) )
unpermute_time = benchmark_unpermute( unpermute_time = benchmark_unpermute(
num_tokens, num_tokens,
@@ -237,6 +302,7 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
use_customized_permute=use_customized_permute,
) )
return permute_time, unpermute_time return permute_time, unpermute_time
@@ -264,7 +330,6 @@ def main(args: argparse.Namespace):
config.architectures[0] == "DeepseekV3ForCausalLM" config.architectures[0] == "DeepseekV3ForCausalLM"
or config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV2ForCausalLM"
or config.architectures[0] == "Glm4MoeForCausalLM" or config.architectures[0] == "Glm4MoeForCausalLM"
or config.architectures[0] == "Glm4MoeLiteForCausalLM"
): ):
E = config.n_routed_experts E = config.n_routed_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
@@ -283,6 +348,7 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.dtype dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute
if args.batch_size is None: if args.batch_size is None:
batch_sizes = [ batch_sizes = [
@@ -334,6 +400,7 @@ def main(args: argparse.Namespace):
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_customized_permute,
) )
for batch_size in batch_sizes for batch_size in batch_sizes
], ],
@@ -353,6 +420,7 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
) )
parser.add_argument("--use-customized-permute", action="store_true")
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--trust-remote-code", action="store_true") parser.add_argument("--trust-remote-code", action="store_true")

View File

@@ -22,8 +22,8 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True) mp.set_start_method("spawn", force=True)
assert current_platform.is_cuda() or current_platform.is_rocm(), ( assert current_platform.is_cuda(), (
"Only support tune w8a8 block fp8 kernel on CUDA/ROCm device." "Only support tune w8a8 block fp8 kernel on CUDA device."
) )
DTYPE_MAP = { DTYPE_MAP = {

View File

@@ -14,6 +14,7 @@ from vllm.triton_utils import triton
from vllm.utils.deep_gemm import ( from vllm.utils.deep_gemm import (
calc_diff, calc_diff,
fp8_gemm_nt, fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
per_block_cast_to_fp8, per_block_cast_to_fp8,
) )
@@ -47,9 +48,8 @@ def benchmark_shape(
block_size = [128, 128] block_size = [128, 128]
# Pre-quantize A for all implementations # Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A, block_size[1], column_major_scales=True, tma_aligned_scales=True A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16) C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
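A reference of what per_token_group_quant_fp8 computes, under the usual convention that every contiguous group of 128 elements along the last dimension gets its own scale amax / 448 (448 being the largest normal value of float8_e4m3fn); the column-major / TMA-aligned scale layout the hunk above deals with is omitted here:

import torch

def per_token_group_quant_fp8_ref(A: torch.Tensor, group_size: int = 128):
    T, K = A.shape
    g = A.float().reshape(T, K // group_size, group_size)
    scales = g.abs().amax(dim=-1, keepdim=True).clamp_(min=1e-12) / 448.0
    q = (g / scales).clamp_(-448.0, 448.0).to(torch.float8_e4m3fn)
    return q.reshape(T, K), scales.squeeze(-1)   # fp8 data, per-group fp32 scales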

View File

@@ -13,8 +13,6 @@ endif()
# #
# Define environment variables for special configurations # Define environment variables for special configurations
# #
set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16}) set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI}) set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16}) set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
@@ -105,16 +103,6 @@ else()
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND) find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
# Support cross-compilation by allowing override via environment variables
if (ENABLE_AVX2)
set(AVX2_FOUND ON)
message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
endif()
if (ENABLE_AVX512)
set(AVX512_FOUND ON)
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
endif()
endif() endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -391,12 +379,6 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
endif() endif()
endif() endif()
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
endif()
if(USE_ONEDNN) if(USE_ONEDNN)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/cpu/dnnl_kernels.cpp" "csrc/cpu/dnnl_kernels.cpp"

View File

@@ -19,7 +19,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
flashmla flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208 GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
@@ -30,24 +30,6 @@ endif()
FetchContent_MakeAvailable(flashmla) FetchContent_MakeAvailable(flashmla)
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# Vendor FlashMLA interface into vLLM with torch-ops shim.
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
FLASHMLA_INTERFACE_CONTENT)
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
FLASHMLA_INTERFACE_CONTENT
"${FLASHMLA_INTERFACE_CONTENT}")
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
"${FLASHMLA_INTERFACE_CONTENT}")
# Install the generated flash_mla_interface.py to the wheel
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
DESTINATION vllm/third_party/flashmla/
COMPONENT _flashmla_C)
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later. # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with # Only build FlashMLA kernels if we are building for something compatible with
# sm90a # sm90a
@@ -73,42 +55,16 @@ if(FLASH_MLA_ARCHS)
set(FlashMLA_SOURCES set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
# Misc kernels for decoding ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
# sm90 dense decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
# sm90 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
# sm90 sparse prefill
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
# sm100 dense prefill & backward
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
# sm100 sparse prefill ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
# sm100 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
) )
set(FlashMLA_Extension_SOURCES set(FlashMLA_Extension_SOURCES
@@ -120,7 +76,6 @@ if(FLASH_MLA_ARCHS)
set(FlashMLA_INCLUDES set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc ${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/kerutils/include
${flashmla_SOURCE_DIR}/csrc/sm90 ${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/cutlass/include ${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
@@ -128,6 +83,7 @@ if(FLASH_MLA_ARCHS)
set(FlashMLA_Extension_INCLUDES set(FlashMLA_Extension_INCLUDES
${flashmla_SOURCE_DIR}/csrc ${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
${flashmla_SOURCE_DIR}/csrc/cutlass/include ${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
@@ -154,12 +110,9 @@ if(FLASH_MLA_ARCHS)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files. # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles. # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
target_compile_options(_flashmla_C PRIVATE target_compile_options(_flashmla_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API> $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API> $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
define_extension_target( define_extension_target(
_flashmla_extension_C _flashmla_extension_C

View File

@@ -7,7 +7,6 @@
#include <vector> #include <vector>
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
int64_t block_size_in_bytes,
const torch::Tensor& block_mapping); const torch::Tensor& block_mapping);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,

View File

@@ -24,14 +24,7 @@
typedef __hip_bfloat16 __nv_bfloat16; typedef __hip_bfloat16 __nv_bfloat16;
#endif #endif
#if defined(__gfx942__)
constexpr float kFp8ScaleDivisor = 224.f;
#else
constexpr float kFp8ScaleDivisor = 448.f;
#endif
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
int64_t block_size_in_bytes,
const torch::Tensor& block_mapping) { const torch::Tensor& block_mapping) {
torch::Device src_device = src.device(); torch::Device src_device = src.device();
torch::Device dst_device = dst.device(); torch::Device dst_device = dst.device();
@@ -56,6 +49,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
char* src_ptr = static_cast<char*>(src.data_ptr()); char* src_ptr = static_cast<char*>(src.data_ptr());
char* dst_ptr = static_cast<char*>(dst.data_ptr()); char* dst_ptr = static_cast<char*>(dst.data_ptr());
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
const at::cuda::OptionalCUDAGuard device_guard( const at::cuda::OptionalCUDAGuard device_guard(
src_device.is_cuda() ? src_device : dst_device); src_device.is_cuda() ? src_device : dst_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
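The new stride-based size matters when the allocator pads each block for alignment: stride(0) counts those padding elements, while a numel-based computation would not, and the comment above assumes the per-block data (padding included) is contiguous. The same computation in PyTorch terms, with hypothetical shapes:

import torch

kv_cache = torch.empty(8, 16, 8, 64, dtype=torch.float16)   # [num_blocks, block_size, heads, head_dim]
block_size_in_bytes = kv_cache.element_size() * kv_cache.stride(0)
# = 2 * 16 * 8 * 64 = 16384 bytes here; with a padded layout, stride(0) exceeds
# block_size * heads * head_dim and the copy still covers the whole contiguous
# per-block region, padding included.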
@@ -208,8 +205,7 @@ __global__ void reshape_and_cache_flash_kernel(
const int64_t block_stride, const int64_t page_stride, const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride, const int64_t head_stride, const int64_t key_stride,
const int64_t value_stride, const int num_heads, const int head_size, const int64_t value_stride, const int num_heads, const int head_size,
const int block_size, const float* k_scale, const float* v_scale, const int block_size, const float* k_scale, const float* v_scale) {
const int kv_scale_stride) {
const int64_t token_idx = blockIdx.x; const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx]; const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded // NOTE: slot_idx can be -1 if the token is padded
@@ -233,23 +229,21 @@ __global__ void reshape_and_cache_flash_kernel(
// this is true for the NHD layout where `head_stride == head_size` // this is true for the NHD layout where `head_stride == head_size`
const bool is_contiguous_heads = (head_stride == head_size); const bool is_contiguous_heads = (head_stride == head_size);
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
if (is_contiguous_heads && kv_scale_stride == 0) { CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
// NHD layout and k/v_scales are [1] (i.e. single scale for all heads) if (is_contiguous_heads) {
// NHD layout
// kv cache: [num_blocks, block_size, num_heads, head_size] // kv cache: [num_blocks, block_size, num_heads, head_size]
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x, vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
blockDim.x, k_op); blockDim.x, k_op);
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems, vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
threadIdx.x, blockDim.x, v_op); threadIdx.x, blockDim.x, v_op);
} else { } else {
// HND layout OR k/v_scales are [num_heads] (i.e. per-attn-head)
// HND layout: heads are strided, but each head_size segment is contiguous // HND layout: heads are strided, but each head_size segment is contiguous
// kv cache: [num_blocks, num_heads, block_size, head_size] // kv cache: [num_blocks, num_heads, block_size, head_size]
const int lane = threadIdx.x & 31; // 0..31 within warp const int lane = threadIdx.x & 31; // 0..31 within warp
@@ -265,16 +259,6 @@ __global__ void reshape_and_cache_flash_kernel(
cache_t* __restrict__ v_dst_h = cache_t* __restrict__ v_dst_h =
value_dst + static_cast<int64_t>(head) * head_stride; value_dst + static_cast<int64_t>(head) * head_stride;
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
? 0.f
: k_scale[head * kv_scale_stride];
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
? 0.f
: v_scale[head * kv_scale_stride];
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
// within each head, let the 32 threads of the warp perform the vector // within each head, let the 32 threads of the warp perform the vector
// copy // copy
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32, vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
@@ -407,7 +391,8 @@ __global__ void concat_and_cache_ds_mla_kernel(
} }
// Compute the scale for the tile // Compute the scale for the tile
float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN); float tile_scale = max_abs / 448.f;
tile_scale = fmaxf(tile_scale, FLT_MIN);
// The first lane of each half-warp writes the scale to kv_cache // The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) { if ((lane_idx == 0) || (lane_idx == 16)) {
@@ -476,8 +461,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
#endif #endif
} }
float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor; #if defined(__gfx942__)
float scale = fmaxf(amax, 1e-4) / 224.0f;
#else
float scale = fmaxf(amax, 1e-4) / 448.0f;
#endif
if (use_ue8m0) { if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale))); scale = exp2f(ceilf(log2f(scale)));
} }
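The use_ue8m0 branch above rounds the scale up to the next power of two so it can be stored as an exponent-only (UE8M0) value; exp2f(ceilf(log2f(scale))) in Python terms:

import math

def round_scale_ue8m0(scale: float) -> float:
    # Round up, never down, so quantized values cannot overflow relative to
    # the original amax-derived scale.
    return 2.0 ** math.ceil(math.log2(scale))

assert round_scale_ue8m0(0.013) == 2.0 ** -6   # 0.015625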
@@ -620,8 +608,7 @@ void reshape_and_cache(
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \ slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
head_stride, key_stride, value_stride, num_heads, head_size, \ head_stride, key_stride, value_stride, num_heads, head_size, \
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \ block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()), \ reinterpret_cast<const float*>(v_scale.data_ptr()));
kv_scale_stride);
void reshape_and_cache_flash( void reshape_and_cache_flash(
torch::Tensor& key, // [num_tokens, num_heads, head_size] torch::Tensor& key, // [num_tokens, num_heads, head_size]
@@ -630,9 +617,8 @@ void reshape_and_cache_flash(
torch::Tensor& torch::Tensor&
value_cache, // [num_blocks, block_size, num_heads, head_size] value_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& k_scale, // [1] or [num_heads] torch::Tensor& v_scale) {
torch::Tensor& v_scale) { // [1] or [num_heads]
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
// slot_mapping.size(0) because of padding for CUDA graphs. // slot_mapping.size(0) because of padding for CUDA graphs.
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
@@ -655,12 +641,6 @@ void reshape_and_cache_flash(
int64_t head_stride = key_cache.stride(2); int64_t head_stride = key_cache.stride(2);
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0)); TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
TORCH_CHECK(k_scale.sizes() == v_scale.sizes(),
"k_scale and v_scale must have the same shape");
TORCH_CHECK(k_scale.numel() == 1 || k_scale.numel() == num_heads,
"k_scale and v_scale must be of shape [1] or [num_heads]");
int kv_scale_stride = (k_scale.numel() > 1) ? 1 : 0;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_size, 512)); dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
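The kv_scale_stride computed above lets one indexing expression serve both accepted scale shapes: with a [1] scale the stride is 0 and every head re-reads the same element, with a [num_heads] scale the stride is 1. Sketch of the same idea:

import torch

num_heads = 8
k_scale = torch.tensor([0.02])                 # per-tensor; or torch.rand(num_heads) for per-head
kv_scale_stride = 1 if k_scale.numel() > 1 else 0
per_head = [float(k_scale[h * kv_scale_stride]) for h in range(num_heads)]
# stride 0 broadcasts the single scale to every head; stride 1 picks one per head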

View File

@@ -80,10 +80,8 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8); reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
} }
// ASIMD does not support non-temporal loads
explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}
explicit FP16Vec16(const FP32Vec16& vec); explicit FP16Vec16(const FP32Vec16& vec);
void save(void* ptr) const { void save(void* ptr) const {
vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
@@ -192,9 +190,6 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr) explicit BF16Vec16(const void* ptr)
: reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {}; : reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {};
// ASIMD does not support non-temporal loads
explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}
explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
explicit BF16Vec16(const FP32Vec16&); explicit BF16Vec16(const FP32Vec16&);
@@ -479,9 +474,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
: reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8),
vld1q_f32(ptr + 12)}) {} vld1q_f32(ptr + 12)}) {}
// ASIMD does not support non-temporal loads
explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}
explicit FP32Vec16(float32x4x4_t data) : reg(data) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
explicit FP32Vec16(const FP32Vec8& data) { explicit FP32Vec16(const FP32Vec8& data) {
@@ -764,96 +756,6 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
}; };
}; };
struct INT8Vec64 : public Vec<INT8Vec64> {
constexpr static int VEC_ELEM_NUM = 64;
union AliasReg {
int8x16x4_t reg;
int8_t values[VEC_ELEM_NUM];
};
int8x16x4_t reg;
explicit INT8Vec64(const int8_t* ptr) { reg = vld1q_s8_x4(ptr); }
// ASIMD does not support non-temporal loads
explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}
void save(int8_t* ptr) const { vst1q_s8_x4(ptr, reg); }
// masked store
void save(int8_t* p, int elem_num) const {
TORCH_CHECK(elem_num <= VEC_ELEM_NUM && elem_num > 0);
if (elem_num == VEC_ELEM_NUM) {
vst1q_s8_x4(p, reg);
return;
}
const int full_quadwords = elem_num / 16;
const int remaining_bytes = elem_num % 16;
for (int i = 0; i < full_quadwords; ++i) {
vst1q_s8(p + 16 * i, reg.val[i]);
}
if (remaining_bytes) {
const int8x16_t v = reg.val[full_quadwords];
int8_t* tail = p + 16 * full_quadwords;
switch (remaining_bytes) {
case 15:
tail[14] = vgetq_lane_s8(v, 14);
[[fallthrough]];
case 14:
tail[13] = vgetq_lane_s8(v, 13);
[[fallthrough]];
case 13:
tail[12] = vgetq_lane_s8(v, 12);
[[fallthrough]];
case 12:
tail[11] = vgetq_lane_s8(v, 11);
[[fallthrough]];
case 11:
tail[10] = vgetq_lane_s8(v, 10);
[[fallthrough]];
case 10:
tail[9] = vgetq_lane_s8(v, 9);
[[fallthrough]];
case 9:
tail[8] = vgetq_lane_s8(v, 8);
[[fallthrough]];
case 8:
tail[7] = vgetq_lane_s8(v, 7);
[[fallthrough]];
case 7:
tail[6] = vgetq_lane_s8(v, 6);
[[fallthrough]];
case 6:
tail[5] = vgetq_lane_s8(v, 5);
[[fallthrough]];
case 5:
tail[4] = vgetq_lane_s8(v, 4);
[[fallthrough]];
case 4:
tail[3] = vgetq_lane_s8(v, 3);
[[fallthrough]];
case 3:
tail[2] = vgetq_lane_s8(v, 2);
[[fallthrough]];
case 2:
tail[1] = vgetq_lane_s8(v, 1);
[[fallthrough]];
case 1:
tail[0] = vgetq_lane_s8(v, 0);
break;
default:
break;
}
}
}
// ASIMD does not support non-temporal stores
void nt_save(int8_t* ptr) const { save(ptr); }
}; // INT8Vec64
template <typename T> template <typename T>
struct VecType { struct VecType {
using vec_type = void; using vec_type = void;

View File

@@ -116,7 +116,7 @@ class Dequantizer4b {
scalar_vec_t output_vec_0(wb_0); scalar_vec_t output_vec_0(wb_0);
scalar_vec_t output_vec_1(wb_1); scalar_vec_t output_vec_1(wb_1);
// AMX needs to interleave K elements to pack as 32 bits // AMX needs to interlave K elements to pack as 32 bits
if constexpr (isa == ISA::AMX) { if constexpr (isa == ISA::AMX) {
vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight); vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
} else { } else {

View File

@@ -360,14 +360,13 @@ void onednn_scaled_mm(
const std::optional<torch::Tensor>& azp, // [M] or [1] const std::optional<torch::Tensor>& azp, // [M] or [1]
const std::optional<torch::Tensor>& azp_adj, // [M] or [1] const std::optional<torch::Tensor>& azp_adj, // [M] or [1]
const std::optional<torch::Tensor>& bias, // [N] const std::optional<torch::Tensor>& bias, // [N]
const torch::Tensor& handler_tensor) { int64_t handler) {
CPU_KERNEL_GUARD_IN(onednn_scaled_mm) CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
TORCH_CHECK(a.dim() == 2); TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.is_contiguous()); TORCH_CHECK(a.is_contiguous());
TORCH_CHECK(c.is_contiguous()); TORCH_CHECK(c.is_contiguous());
W8A8MatMulPrimitiveHandler* ptr = W8A8MatMulPrimitiveHandler* ptr =
reinterpret_cast<W8A8MatMulPrimitiveHandler*>( reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
handler_tensor.item<int64_t>());
const int32_t* azp_ptr = nullptr; const int32_t* azp_ptr = nullptr;
if (azp.has_value()) { if (azp.has_value()) {
azp_ptr = azp->data_ptr<int32_t>(); azp_ptr = azp->data_ptr<int32_t>();
@@ -520,14 +519,13 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, // [M, OC], row-major void onednn_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a, // [M, IC], row-major const torch::Tensor& a, // [M, IC], row-major
const std::optional<torch::Tensor>& bias, const std::optional<torch::Tensor>& bias, int64_t handler) {
const torch::Tensor& handler_tensor) {
CPU_KERNEL_GUARD_IN(onednn_mm) CPU_KERNEL_GUARD_IN(onednn_mm)
TORCH_CHECK(a.dim() == 2); TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.stride(-1) == 1); TORCH_CHECK(a.stride(-1) == 1);
TORCH_CHECK(c.stride(-1) == 1); TORCH_CHECK(c.stride(-1) == 1);
MatMulPrimitiveHandler* ptr = MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>()); reinterpret_cast<MatMulPrimitiveHandler*>(handler);
// ACL matmuls expect contiguous source tensors // ACL matmuls expect contiguous source tensors
#ifdef VLLM_USE_ACL #ifdef VLLM_USE_ACL

View File

@@ -5,10 +5,6 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#ifdef __aarch64__
#include <atomic>
#endif
namespace { namespace {
#define MAX_SHM_RANK_NUM 8 #define MAX_SHM_RANK_NUM 8
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) #define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
@@ -38,17 +34,8 @@ struct KernelVecType<c10::Half> {
}; };
struct ThreadSHMContext { struct ThreadSHMContext {
#ifdef __aarch64__
// memory model is weaker on AArch64, so we use atomic variables for
// consumer (load-acquire) and producer (store-release) to make sure
// that a stamp cannot be ready before the corresponding data is ready.
std::atomic<char> _curr_thread_stamp[2];
std::atomic<char> _ready_thread_stamp[2];
static_assert(std::atomic<char>::is_always_lock_free);
#else
volatile char _curr_thread_stamp[2]; volatile char _curr_thread_stamp[2];
volatile char _ready_thread_stamp[2]; volatile char _ready_thread_stamp[2];
#endif // __aarch64__
int local_stamp_buffer_idx; int local_stamp_buffer_idx;
int remote_stamp_buffer_idx; int remote_stamp_buffer_idx;
int thread_id; int thread_id;
@@ -75,17 +62,10 @@ struct ThreadSHMContext {
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
TORCH_CHECK((size_t)this % 64 == 0); TORCH_CHECK((size_t)this % 64 == 0);
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
#ifdef __aarch64__
_curr_thread_stamp[0].store(1, std::memory_order_relaxed);
_curr_thread_stamp[1].store(1, std::memory_order_relaxed);
_ready_thread_stamp[0].store(0, std::memory_order_relaxed);
_ready_thread_stamp[1].store(0, std::memory_order_relaxed);
#else
_curr_thread_stamp[0] = 1; _curr_thread_stamp[0] = 1;
_curr_thread_stamp[1] = 1; _curr_thread_stamp[1] = 1;
_ready_thread_stamp[0] = 0; _ready_thread_stamp[0] = 0;
_ready_thread_stamp[1] = 0; _ready_thread_stamp[1] = 0;
#endif // __aarch64__
_thread_buffer_mask[0] = 0; _thread_buffer_mask[0] = 0;
_thread_buffer_mask[1] = 0; _thread_buffer_mask[1] = 0;
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
@@ -123,43 +103,19 @@ struct ThreadSHMContext {
_thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF; _thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
} }
char get_curr_stamp(int idx) const { char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
#ifdef __aarch64__
return _curr_thread_stamp[idx].load(std::memory_order_acquire);
#else
return _curr_thread_stamp[idx];
#endif // __aarch64__
}
char get_ready_stamp(int idx) const { char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
#ifdef __aarch64__
return _ready_thread_stamp[idx].load(std::memory_order_acquire);
#else
return _ready_thread_stamp[idx];
#endif // __aarch64__
}
void next_stamp() { void next_stamp() {
#ifdef __aarch64__
_curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
1, std::memory_order_release);
#else
_mm_mfence(); _mm_mfence();
_curr_thread_stamp[local_stamp_buffer_idx] += 1; _curr_thread_stamp[local_stamp_buffer_idx] += 1;
#endif // __aarch64__
} }
void commit_ready_stamp() { void commit_ready_stamp() {
#ifdef __aarch64__
_ready_thread_stamp[local_stamp_buffer_idx].store(
_curr_thread_stamp[local_stamp_buffer_idx].load(
std::memory_order_relaxed),
std::memory_order_release);
#else
_mm_mfence(); _mm_mfence();
_ready_thread_stamp[local_stamp_buffer_idx] = _ready_thread_stamp[local_stamp_buffer_idx] =
_curr_thread_stamp[local_stamp_buffer_idx]; _curr_thread_stamp[local_stamp_buffer_idx];
#endif // __aarch64__
} }
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
@@ -186,11 +142,7 @@ struct ThreadSHMContext {
break; break;
} }
++_spinning_count; ++_spinning_count;
#ifdef __aarch64__
__asm__ __volatile__("yield");
#else
_mm_pause(); _mm_pause();
#endif // __aarch64__
} }
} }
@@ -237,10 +189,10 @@ struct ThreadSHMContext {
class SHMManager { class SHMManager {
public: public:
explicit SHMManager(const std::string& name, const int rank, explicit SHMManager(const std::string& name, const int rank,
const int group_size, const int thread_num) const int group_size)
: _rank(rank), : _rank(rank),
_group_size(group_size), _group_size(group_size),
_thread_num(thread_num), _thread_num(omp_get_max_threads()),
_shm_names({""}), _shm_names({""}),
_shared_mem_ptrs({nullptr}), _shared_mem_ptrs({nullptr}),
_shm_ctx(nullptr) { _shm_ctx(nullptr) {
@@ -282,11 +234,11 @@ class SHMManager {
} }
static int64_t create_singleton_instance(const std::string& name, static int64_t create_singleton_instance(const std::string& name,
const int group_size, const int rank, const int group_size,
const int thread_num) { const int rank) {
std::lock_guard<std::mutex> guard(SingletonInstancesLock); std::lock_guard<std::mutex> guard(SingletonInstancesLock);
SingletonInstances.emplace_back( SingletonInstances.emplace_back(
std::make_unique<SHMManager>(name, rank, group_size, thread_num)); std::make_unique<SHMManager>(name, rank, group_size));
return static_cast<int64_t>(SingletonInstances.size() - 1); return static_cast<int64_t>(SingletonInstances.size() - 1);
} }
@@ -854,9 +806,8 @@ std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
} }
int64_t init_shm_manager(const std::string& name, const int64_t group_size, int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank, const int64_t thread_num) { const int64_t rank) {
return SHMManager::create_singleton_instance(name, group_size, rank, return SHMManager::create_singleton_instance(name, group_size, rank);
thread_num);
} }
std::string join_shm_manager(int64_t handle, const std::string& name) { std::string join_shm_manager(int64_t handle, const std::string& name) {

View File

@@ -19,14 +19,13 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& azp, const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& azp_adj, const std::optional<torch::Tensor>& azp_adj,
const std::optional<torch::Tensor>& bias, const std::optional<torch::Tensor>& bias,
const torch::Tensor& handler_tensor); int64_t handler);
int64_t create_onednn_mm_handler(const torch::Tensor& b, int64_t create_onednn_mm_handler(const torch::Tensor& b,
int64_t primitive_cache_size); int64_t primitive_cache_size);
void onednn_mm(torch::Tensor& c, const torch::Tensor& a, void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& bias, const std::optional<torch::Tensor>& bias, int64_t handler);
const torch::Tensor& handler_tensor);
bool is_onednn_acl_supported(); bool is_onednn_acl_supported();
@@ -35,7 +34,7 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& block_tables, torch::Tensor& seq_lens); torch::Tensor& block_tables, torch::Tensor& seq_lens);
int64_t init_shm_manager(const std::string& name, const int64_t group_size, int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank, const int64_t thread_num); const int64_t rank);
std::string join_shm_manager(int64_t handle, const std::string& name); std::string join_shm_manager(int64_t handle, const std::string& name);
@@ -197,7 +196,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN GEMM // oneDNN GEMM
ops.def( ops.def(
"onednn_mm(Tensor! c, Tensor a, Tensor? bias, " "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
"Tensor handler_tensor) -> ()"); "int handler) -> ()");
ops.impl("onednn_mm", torch::kCPU, &onednn_mm); ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
// Check if oneDNN was built with ACL backend // Check if oneDNN was built with ACL backend
@@ -213,7 +212,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN scaled_mm for W8A8 with static per-tensor activation quantization // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
ops.def( ops.def(
"onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
"Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()"); "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.
@@ -231,11 +230,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif #endif
// SHM CCL // SHM CCL
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) #ifdef __AVX512F__
ops.def( ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
"init_shm_manager(str name, int group_size, int rank, int thread_num) -> " &init_shm_manager);
"int",
&init_shm_manager);
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager); ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
ops.def("shm_allreduce(int handle, Tensor! data) -> ()"); ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce); ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
@@ -253,7 +250,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list); ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)", ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
&shm_recv_tensor_list); &shm_recv_tensor_list);
#endif // #if defined(__AVX512F__) || defined(__aarch64__) #endif
// sgl-kernels // sgl-kernels
#if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__) #if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__)
@@ -294,7 +291,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor " "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
"seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt " "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, " "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
"float softcap, Tensor scheduler_metadata, Tensor? s_aux) -> ()", "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
&cpu_attention_with_kv_cache); &cpu_attention_with_kv_cache);
// placeholders // placeholders
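Per the updated schemas above, the oneDNN primitive handler is no longer passed as a raw int but boxed in a tensor and unboxed on the C++ side with handler_tensor.item<int64_t>(). A runnable sketch of just the boxing side (the handler value is a stand-in for what create_onednn_mm_handler returns, and the op call appears only as a comment with illustrative names):

import torch

handler = 0x7F3C2A400000                                   # stand-in for the real handler id
handler_tensor = torch.tensor(handler, dtype=torch.int64)  # box it for the Tensor-typed schema
assert handler_tensor.item() == handler                    # C++: handler_tensor.item<int64_t>()
# e.g. torch.ops._C.onednn_mm(c, a, bias, handler_tensor)  # namespace/arguments illustrative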

View File

@@ -31,6 +31,8 @@ namespace moe {
constexpr unsigned FULL_WARP_MASK = 0xffffffff; constexpr unsigned FULL_WARP_MASK = 0xffffffff;
constexpr int32_t WARP_SIZE = 32; constexpr int32_t WARP_SIZE = 32;
constexpr int32_t BLOCK_SIZE = 512;
constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
namespace warp_topk { namespace warp_topk {
@@ -63,6 +65,14 @@ __forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
return res; return res;
} }
template <typename T, typename idxT>
int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
int64_t n = std::max<int>(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
return max(cache_topk,
round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
}
template <int size, bool ascending, bool reverse, typename T, typename idxT, template <int size, bool ascending, bool reverse, typename T, typename idxT,
bool is_stable> bool is_stable>
struct BitonicMerge { struct BitonicMerge {
@@ -257,15 +267,6 @@ class WarpSort {
} }
} }
// Accessors for per-lane selected value/index.
// NOTE: For the common case `capacity == WARP_SIZE`, `max_arr_len_ == 1`
// and callers should use `i == 0`.
__device__ __forceinline__ idxT get_idx(int i = 0) const {
return idx_arr_[i];
}
__device__ __forceinline__ T get_val(int i = 0) const { return val_arr_[i]; }
protected: protected:
static constexpr int max_arr_len_ = capacity / WARP_SIZE; static constexpr int max_arr_len_ = capacity / WARP_SIZE;
@@ -284,7 +285,6 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
__device__ WarpSelect(idxT k, T dummy) __device__ WarpSelect(idxT k, T dummy)
: WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy), : WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
k_th_(dummy), k_th_(dummy),
k_th_idx_(0),
k_th_lane_((k - 1) % WARP_SIZE) { k_th_lane_((k - 1) % WARP_SIZE) {
extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[]; extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[];
@@ -346,6 +346,9 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0; idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
merge_buf_(val, idx); merge_buf_(val, idx);
} }
// after done(), smem is used for merging results among warps
__syncthreads();
} }
private: private:
@@ -500,186 +503,255 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
} }
} }
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF> template <typename T, typename BiasT, ScoringFunc SF>
__global__ void grouped_topk_fused_kernel( __global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
T* scores, float* topk_values, IdxT* topk_indices, BiasT const* bias, int64_t const num_tokens,
int64_t const num_tokens, int64_t const num_experts, int64_t const n_group, int64_t const num_cases,
int64_t const topk_group, int64_t const topk, bool renormalize, int64_t const n_group,
int64_t const num_experts_per_group) {
int32_t warp_id = threadIdx.x / WARP_SIZE;
int32_t lane_id = threadIdx.x % WARP_SIZE;
int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
if (case_id < num_cases) {
input += case_id * num_experts_per_group;
// bias is per expert group, offset to current group
int32_t group_id = case_id % n_group;
BiasT const* group_bias = bias + group_id * num_experts_per_group;
output += case_id;
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;");
#endif
topk_with_k2<T, BiasT, SF>(output, input, group_bias, tile, lane_id,
num_experts_per_group);
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
}
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF,
int NGroup = -1>
__global__ void group_idx_and_topk_idx_kernel(
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
BiasT const* bias, int64_t const num_tokens, int64_t const n_group,
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
int64_t const num_experts_per_group, bool renormalize,
double routed_scaling_factor) { double routed_scaling_factor) {
int32_t const token_id = static_cast<int32_t>(blockIdx.x); int32_t warp_id = threadIdx.x / WARP_SIZE;
if (token_id >= num_tokens) { int32_t lane_id = threadIdx.x % WARP_SIZE;
return; int32_t case_id =
} blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token
scores += case_id * num_experts;
group_scores += case_id * n_group;
topk_values += case_id * topk;
topk_indices += case_id * topk;
int32_t const warp_id = threadIdx.x / WARP_SIZE; constexpr bool kUseStaticNGroup = (NGroup > 0);
int32_t const lane_id = threadIdx.x % WARP_SIZE; // use int32 to avoid implicit conversion
int32_t const n_group_i32 =
kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
int32_t const n_group_i32 = static_cast<int32_t>(n_group); int32_t align_num_experts_per_group =
int32_t const topk_group_i32 = static_cast<int32_t>(topk_group); warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
int32_t const topk_i32 = static_cast<int32_t>(topk);
int32_t const num_experts_i32 = static_cast<int32_t>(num_experts);
int32_t const num_warps = blockDim.x / WARP_SIZE;
if (warp_id >= n_group_i32 || num_warps < n_group_i32) {
return;
}
int32_t const num_experts_per_group = num_experts_i32 / n_group_i32;
T* scores_token = scores + static_cast<int64_t>(token_id) * num_experts;
cg::thread_block block = cg::this_thread_block(); cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
extern __shared__ char smem_buf[]; extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to
// warpSelect internal staging buffer layout // store the target topk idx
size_t const val_bytes = int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T); T* s_topk_value =
size_t const val_bytes_aligned = reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
warp_topk::round_up_to_multiple_of<256>(val_bytes); warp_id * topk;
size_t const idx_bytes = s_topk_idx += warp_id * topk;
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
// user-managed shared memory starts after warpSelect internal staging. T value = neg_inf<T>();
uintptr_t ptr_u = reinterpret_cast<uintptr_t>(smem_buf + internal_bytes); T topk_group_value = neg_inf<T>();
ptr_u = (ptr_u + 15) & ~static_cast<uintptr_t>(15); // align to 16B int32_t num_equalto_topkth_group;
T* s_group_scores = reinterpret_cast<T*>(ptr_u);
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before
// acqbulk because it's ptr arithmetic // acqbulk because it's ptr arithmetic
#endif #endif
// phase 1: per-group scan if (case_id < num_tokens) {
int32_t const group_offset = warp_id * num_experts_per_group; // calculate group_idx
topk_with_k2<T, BiasT, SF>(s_group_scores + warp_id, int32_t target_num_min =
scores_token + group_offset, bias + group_offset, WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
tile, lane_id, num_experts_per_group); // The check is necessary to avoid abnormal input
if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
value = group_scores[lane_id];
}
int count_equal_to_top_value = WARP_SIZE - n_group_i32;
int pre_count_equal_to_top_value = 0;
// Use loop to find the largset top_group
while (count_equal_to_top_value < target_num_min) {
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
if (value == topk_group_value) {
value = neg_inf<T>();
}
pre_count_equal_to_top_value = count_equal_to_top_value;
count_equal_to_top_value =
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
}
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
}
__syncthreads();
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true>
queue((int32_t)topk, neg_inf<T>());
int count_equalto_topkth_group = 0;
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
if (case_id < num_tokens && if_proceed_next_topk) {
auto process_group = [&](int i_group) {
if ((group_scores[i_group] > topk_group_value) ||
((group_scores[i_group] == topk_group_value) &&
(count_equalto_topkth_group < num_equalto_topkth_group))) {
int32_t offset = i_group * num_experts_per_group;
for (int32_t i = lane_id; i < align_num_experts_per_group;
i += WARP_SIZE) {
T candidates = neg_inf<T>();
if (i < num_experts_per_group) {
// apply scoring function (if any) and add bias
T input = scores[offset + i];
if (is_finite(input)) {
T score = apply_scoring<SF>(input);
candidates = score + static_cast<T>(bias[offset + i]);
}
}
queue.add(candidates, offset + i);
}
if (group_scores[i_group] == topk_group_value) {
count_equalto_topkth_group++;
}
}
};
if constexpr (kUseStaticNGroup) {
#pragma unroll
for (int i_group = 0; i_group < NGroup; ++i_group) {
process_group(i_group);
}
} else {
for (int i_group = 0; i_group < n_group_i32; ++i_group) {
process_group(i_group);
}
}
queue.done();
// Get the topk_idx
queue.dumpIdx(s_topk_idx);
}
// Load the valid score value
// Calculate the summation
float topk_sum = 1e-20;
if (case_id < num_tokens && if_proceed_next_topk) {
for (int i = lane_id;
i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
i += WARP_SIZE) {
T value = cuda_cast<T, float>(0.0f);
if (i < topk) {
// Load the score value (without bias) for normalization
T input = scores[s_topk_idx[i]];
value = apply_scoring<SF>(input);
s_topk_value[i] = value;
}
if (renormalize) {
topk_sum +=
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
}
}
}
__syncthreads(); __syncthreads();
// phase 2: warp0 selects groups + merges candidates to final topk if (case_id < num_tokens) {
if (warp_id != 0) { if (if_proceed_next_topk) {
return; float scale = routed_scaling_factor;
} if (renormalize) {
scale /= topk_sum;
topk_values += static_cast<int64_t>(token_id) * topk; }
topk_indices += static_cast<int64_t>(token_id) * topk; for (int i = lane_id; i < topk; i += WARP_SIZE) {
float base = cuda_cast<float, T>(s_topk_value[i]);
// select topk_group groups by group score float value = base * scale;
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t, topk_indices[i] = s_topk_idx[i];
/* is_stable */ true> topk_values[i] = value;
group_sel(static_cast<int32_t>(topk_group_i32), neg_inf<T>()); }
} else {
// all lanes must participate in WarpSelect::add(). for (int i = lane_id; i < topk; i += WARP_SIZE) {
T gscore = (lane_id < n_group_i32) ? s_group_scores[lane_id] : neg_inf<T>(); topk_indices[i] = i;
group_sel.add(gscore, lane_id); topk_values[i] = 1.0f / topk;
group_sel.done();
// proceed only if the k-th selected group score is not -inf
bool proceed = false;
if (topk_group_i32 > 0) {
int const kth_lane = topk_group_i32 - 1;
// broadcast the k-th selected group score to all lanes
T kth_val = __shfl_sync(FULL_WARP_MASK, group_sel.get_val(0), kth_lane);
proceed = (kth_val != neg_inf<T>());
}
if (!proceed) {
for (int i = lane_id; i < topk_i32; i += WARP_SIZE) {
topk_indices[i] = static_cast<IdxT>(i);
topk_values[i] = 1.0f / static_cast<float>(topk_i32);
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
return;
}
// merge per-group topk candidates for selected groups, then select topk
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true>
expert_sel(static_cast<int32_t>(topk_i32), neg_inf<T>());
// selected group ids reside in lanes [0, topk_group)
int32_t sel_gid_lane = (lane_id < topk_group_i32) ? group_sel.get_idx(0) : 0;
// add candidates from selected groups to expert_sel
for (int32_t g = 0; g < topk_group_i32; ++g) {
int32_t gid = __shfl_sync(FULL_WARP_MASK, sel_gid_lane, g);
int32_t const offset = gid * num_experts_per_group;
int32_t const align_num_experts_per_group =
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) {
// all lanes must call `add()` the same number of times.
T cand = neg_inf<T>();
int32_t idx = 0;
if (i < num_experts_per_group) {
idx = offset + i;
T input = scores_token[idx];
if (is_finite(input)) {
T score = apply_scoring<SF>(input);
cand = score + static_cast<T>(bias[idx]);
}
} }
expert_sel.add(cand, idx);
} }
// Note: when if_proceed_next_topk==false, choose the first 8 experts as the
// default result.
} }
expert_sel.done();
// compute unbiased routing weights + optional renorm.
float lane_unbiased = 0.0f;
IdxT lane_idx = 0;
if (lane_id < topk_i32) {
lane_idx = static_cast<IdxT>(expert_sel.get_idx(0));
T in = scores_token[static_cast<int32_t>(lane_idx)];
lane_unbiased = cuda_cast<float, T>(apply_scoring<SF>(in));
}
float topk_sum = 1e-20f;
if (renormalize) {
topk_sum += cg::reduce(tile, lane_unbiased, cg::plus<float>());
}
float scale = static_cast<float>(routed_scaling_factor);
if (renormalize) {
scale /= topk_sum;
}
if (lane_id < topk_i32) {
topk_indices[lane_id] = lane_idx;
topk_values[lane_id] = lane_unbiased * scale;
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
#endif
}
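For reference while reading the kernel above, the routing it implements can be sketched on the CPU. This is a simplified sketch, not the kernel's code: the function and struct names are hypothetical, sigmoid scoring is assumed, and the group score is assumed (as the topk_with_k2 naming suggests) to be the sum of a group's two largest biased expert scores. The weight computation mirrors the tail of the kernel: unbiased scores of the selected experts, scaled by routed_scaling_factor and, when renormalize is set, divided by their sum plus a small epsilon.

// Hypothetical CPU reference for the no-aux-loss grouped top-k routing above.
#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <vector>

struct RoutingResult {
  std::vector<int> indices;
  std::vector<float> values;
};

RoutingResult grouped_topk_reference(const std::vector<float>& scores,  // [num_experts]
                                     const std::vector<float>& bias,    // [num_experts]
                                     int n_group, int topk_group, int topk,
                                     bool renormalize, float routed_scaling_factor) {
  const int num_experts = static_cast<int>(scores.size());
  const int experts_per_group = num_experts / n_group;

  // Sigmoid scoring; the biased copy is used only for selection.
  std::vector<float> s(num_experts), s_biased(num_experts);
  for (int e = 0; e < num_experts; ++e) {
    s[e] = 1.0f / (1.0f + std::exp(-scores[e]));
    s_biased[e] = s[e] + bias[e];
  }

  // Assumed group score: sum of the two largest biased scores in the group
  // (inefficient but clear; requires experts_per_group >= 2).
  auto group_score = [&](int g) {
    std::vector<float> v(s_biased.begin() + g * experts_per_group,
                         s_biased.begin() + (g + 1) * experts_per_group);
    std::partial_sort(v.begin(), v.begin() + 2, v.end(), std::greater<float>());
    return v[0] + v[1];
  };
  std::vector<int> groups(n_group);
  std::iota(groups.begin(), groups.end(), 0);
  std::partial_sort(groups.begin(), groups.begin() + topk_group, groups.end(),
                    [&](int a, int b) { return group_score(a) > group_score(b); });

  // Gather candidates from the selected groups, pick top-k by biased score.
  std::vector<int> cand;
  for (int i = 0; i < topk_group; ++i)
    for (int e = 0; e < experts_per_group; ++e)
      cand.push_back(groups[i] * experts_per_group + e);
  std::partial_sort(cand.begin(), cand.begin() + topk, cand.end(),
                    [&](int a, int b) { return s_biased[a] > s_biased[b]; });
  cand.resize(topk);

  // Output weights use the unbiased scores, scaled and optionally renormalized.
  float scale = routed_scaling_factor;
  if (renormalize) {
    float sum = 1e-20f;
    for (int e : cand) sum += s[e];
    scale /= sum;
  }
  RoutingResult out;
  out.indices = cand;
  for (int e : cand) out.values.push_back(s[e] * scale);
  return out;
}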
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
inline void launch_group_idx_and_topk_kernel(
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
float* topk_values, IdxT* topk_indices, BiasT const* bias,
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
int64_t const topk, int64_t const num_experts,
int64_t const num_experts_per_group, bool const renormalize,
double const routed_scaling_factor) {
auto launch = [&](auto* kernel_instance2) {
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
topk_values, topk_indices, bias, num_tokens, n_group,
topk_group, topk, num_experts, num_experts_per_group,
renormalize, routed_scaling_factor);
};
switch (n_group) {
case 4: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 4>);
break;
}
case 8: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 8>);
break;
}
case 16: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 16>);
break;
}
case 32: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 32>);
break;
}
default: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF>);
break;
}
}
}
template <typename T, typename BiasT, typename IdxT> template <typename T, typename BiasT, typename IdxT>
-void invokeNoAuxTc(T* scores, float* topk_values, IdxT* topk_indices,
-                   BiasT const* bias, int64_t const num_tokens,
-                   int64_t const num_experts, int64_t const n_group,
-                   int64_t const topk_group, int64_t const topk,
-                   bool const renormalize, double const routed_scaling_factor,
-                   int const scoring_func, bool enable_pdl = false,
-                   cudaStream_t const stream = 0) {
+void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
+                   IdxT* topk_indices, BiasT const* bias,
+                   int64_t const num_tokens, int64_t const num_experts,
+                   int64_t const n_group, int64_t const topk_group,
+                   int64_t const topk, bool const renormalize,
+                   double const routed_scaling_factor, int const scoring_func,
+                   bool enable_pdl = false, cudaStream_t const stream = 0) {
+  int64_t num_cases = num_tokens * n_group;
+  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
cudaLaunchConfig_t config; cudaLaunchConfig_t config;
// One block per token; one warp per group. config.gridDim = topk_with_k2_num_blocks;
config.gridDim = static_cast<uint32_t>(num_tokens); config.blockDim = BLOCK_SIZE;
config.blockDim = static_cast<uint32_t>(n_group) * WARP_SIZE; config.dynamicSmemBytes = 0;
// Dynamic shared memory: WarpSelect staging + per-group topk buffers.
int32_t const num_warps = static_cast<int32_t>(n_group);
size_t const val_bytes =
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
size_t const val_bytes_aligned =
warp_topk::round_up_to_multiple_of<256>(val_bytes);
size_t const idx_bytes =
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
size_t const extra_bytes = 16 + static_cast<size_t>(n_group) * sizeof(T);
config.dynamicSmemBytes = internal_bytes + extra_bytes;
config.stream = stream; config.stream = stream;
cudaLaunchAttribute attrs[1]; cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
@@ -687,35 +759,66 @@ void invokeNoAuxTc(T* scores, float* topk_values, IdxT* topk_indices,
config.numAttrs = 1; config.numAttrs = 1;
config.attrs = attrs; config.attrs = attrs;
auto const sf = static_cast<ScoringFunc>(scoring_func); auto const sf = static_cast<ScoringFunc>(scoring_func);
int64_t const num_experts_per_group = num_experts / n_group;
auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
num_tokens, num_cases, n_group, num_experts_per_group);
};
switch (sf) { switch (sf) {
case SCORING_NONE: { case SCORING_NONE: {
auto* kernel_instance = auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_NONE>;
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_NONE>; launch_topk_with_k2(kernel_instance1);
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values, break;
topk_indices, bias, num_tokens, num_experts, n_group,
topk_group, topk, renormalize, routed_scaling_factor);
return;
} }
case SCORING_SIGMOID: { case SCORING_SIGMOID: {
auto* kernel_instance = auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_SIGMOID>;
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_SIGMOID>; launch_topk_with_k2(kernel_instance1);
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values, break;
topk_indices, bias, num_tokens, num_experts, n_group,
topk_group, topk, renormalize, routed_scaling_factor);
return;
} }
default: default:
// should be guarded by higher level checks. // should be guarded by higher level checks.
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc"); TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
} }
int64_t topk_with_k_group_num_blocks =
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
size_t dynamic_smem_in_bytes =
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
topk);
config.gridDim = topk_with_k_group_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = dynamic_smem_in_bytes;
config.stream = stream;
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
switch (sf) {
case SCORING_NONE: {
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_NONE>(
config, scores, group_scores, topk_values, topk_indices, bias,
num_tokens, n_group, topk_group, topk, num_experts,
num_experts_per_group, renormalize, routed_scaling_factor);
break;
}
case SCORING_SIGMOID: {
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_SIGMOID>(
config, scores, group_scores, topk_values, topk_indices, bias,
num_tokens, n_group, topk_group, topk, num_experts,
num_experts_per_group, renormalize, routed_scaling_factor);
break;
}
default:
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
}
} }
 #define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT)                                  \
   template void invokeNoAuxTc<T, BiasT, IdxT>(                                \
-      T * scores, float* topk_values, IdxT* topk_indices, BiasT const* bias,  \
-      int64_t const num_tokens, int64_t const num_experts,                    \
+      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices,   \
+      BiasT const* bias, int64_t const num_tokens, int64_t const num_experts, \
       int64_t const n_group, int64_t const topk_group, int64_t const topk,    \
       bool const renormalize, double const routed_scaling_factor,             \
       int const scoring_func, bool enable_pdl, cudaStream_t const stream);

 INSTANTIATE_NOAUX_TC(float, float, int32_t);
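Both launch paths above opt into programmatic dependent launch (PDL) via cudaLaunchAttributeProgrammaticStreamSerialization, paired with the griddepcontrol.launch_dependents instruction inside the kernels. A minimal stand-alone sketch of that pattern follows; the kernels and helper names are hypothetical, and it assumes compilation for sm_90 with a recent CUDA toolkit.

// Hedged sketch of a PDL producer/consumer pair, mirroring the attribute setup above.
#include <cuda_runtime.h>

__global__ void producer(float* buf) {
  buf[threadIdx.x] = static_cast<float>(threadIdx.x);
  // Device-side PDL intrinsic; corresponds to griddepcontrol.launch_dependents.
  cudaTriggerProgrammaticLaunchCompletion();
}

__global__ void consumer(const float* buf, float* out) {
  // Block until the producer grid has actually finished its writes.
  cudaGridDependencySynchronize();
  out[threadIdx.x] = buf[threadIdx.x] * 2.0f;
}

void launch_pair(float* buf, float* out, cudaStream_t stream, bool enable_pdl) {
  cudaLaunchConfig_t cfg{};
  cfg.gridDim = 1;
  cfg.blockDim = 32;
  cfg.stream = stream;
  cudaLaunchKernelEx(&cfg, producer, buf);

  // The dependent launch carries the PDL attribute, as in the code above.
  cudaLaunchAttribute attr{};
  attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attr.val.programmaticStreamSerializationAllowed = enable_pdl ? 1 : 0;
  cfg.attrs = &attr;
  cfg.numAttrs = 1;
  cudaLaunchKernelEx(&cfg, consumer, static_cast<const float*>(buf), out);
}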
@@ -740,21 +843,17 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
   int64_t num_tokens = input_size[0];
   int64_t num_experts = input_size[1];
   TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
-  TORCH_CHECK(n_group > 0, "n_group must be positive");
-  TORCH_CHECK(topk > 0, "topk must be positive");
-  TORCH_CHECK(topk_group > 0, "topk_group must be positive");
-  TORCH_CHECK(topk_group <= n_group, "topk_group must be <= n_group");
   TORCH_CHECK(num_experts % n_group == 0,
               "num_experts should be divisible by n_group");
   TORCH_CHECK(n_group <= 32,
               "n_group should be smaller than or equal to 32 for now");
   TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
-  TORCH_CHECK(topk <= topk_group * (num_experts / n_group),
-              "topk must be <= topk_group * (num_experts / n_group)");
   TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
                   scoring_func == vllm::moe::SCORING_SIGMOID,
               "scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");
+  torch::Tensor group_scores = torch::empty(
+      {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
   // Always output float32 for topk_values (eliminates Python-side conversion)
   torch::Tensor topk_values = torch::empty(
       {num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
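To make the shape guards concrete, here is a worked check with illustrative (not prescriptive) settings: num_experts = 256 and n_group = 8 give 32 experts per group, topk_group = 4 exposes 4 * 32 = 128 candidate experts, and topk = 8 fits inside that budget, so all of the checks above pass.

// Worked example of the routing-shape invariants checked above (hypothetical numbers).
#include <cassert>

int main() {
  const long num_experts = 256, n_group = 8, topk_group = 4, topk = 8;
  assert(num_experts % n_group == 0);                    // 256 / 8 = 32 experts per group
  assert(topk_group <= n_group);                         // 4 <= 8
  assert(topk <= topk_group * (num_experts / n_group));  // 8 <= 4 * 32 = 128
  return 0;
}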
@@ -769,6 +868,7 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
case torch::kFloat16: \ case torch::kFloat16: \
vllm::moe::invokeNoAuxTc<T, half, IdxT>( \ vllm::moe::invokeNoAuxTc<T, half, IdxT>( \
reinterpret_cast<T*>(scores.mutable_data_ptr()), \ reinterpret_cast<T*>(scores.mutable_data_ptr()), \
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \ reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \ reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
reinterpret_cast<half const*>(bias.data_ptr()), num_tokens, \ reinterpret_cast<half const*>(bias.data_ptr()), num_tokens, \
@@ -779,6 +879,7 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
case torch::kFloat32: \ case torch::kFloat32: \
vllm::moe::invokeNoAuxTc<T, float, IdxT>( \ vllm::moe::invokeNoAuxTc<T, float, IdxT>( \
reinterpret_cast<T*>(scores.mutable_data_ptr()), \ reinterpret_cast<T*>(scores.mutable_data_ptr()), \
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \ reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \ reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
reinterpret_cast<float const*>(bias.data_ptr()), num_tokens, \ reinterpret_cast<float const*>(bias.data_ptr()), num_tokens, \
@@ -789,6 +890,7 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
case torch::kBFloat16: \ case torch::kBFloat16: \
vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>( \ vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>( \
reinterpret_cast<T*>(scores.mutable_data_ptr()), \ reinterpret_cast<T*>(scores.mutable_data_ptr()), \
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \ reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \ reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), \ reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), \

View File

@@ -58,7 +58,7 @@ TEMPLATE = (
"( MARLIN_KERNEL_PARAMS );" "( MARLIN_KERNEL_PARAMS );"
) )
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]

View File

@@ -3,8 +3,8 @@
 #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
 #endif

-#include "quantization/marlin/marlin.cuh"
-#include "quantization/marlin/marlin_dtypes.cuh"
+#include "quantization/gptq_marlin/marlin.cuh"
+#include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"

 #define MARLIN_KERNEL_PARAMS \

View File

@@ -23,10 +23,10 @@
 #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
 #endif

-#include "quantization/marlin/marlin.cuh"
-#include "quantization/marlin/marlin_dtypes.cuh"
-#include "quantization/marlin/dequant.h"
-#include "quantization/marlin/marlin_mma.h"
+#include "quantization/gptq_marlin/marlin.cuh"
+#include "quantization/gptq_marlin/marlin_dtypes.cuh"
+#include "quantization/gptq_marlin/dequant.h"
+#include "quantization/gptq_marlin/marlin_mma.h"
 #include "core/scalar_type.hpp"

 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \

View File

@@ -126,16 +126,14 @@ thread_config_t small_batch_thread_configs[] = {
     // thread_k, thread_n, num_threads
     {128, 128, 256},
-    {64, 128, 128},
-    {128, 64, 128}};
+    {64, 128, 128}};

 thread_config_t large_batch_thread_configs[] = {
     // Ordered by priority
     // thread_k, thread_n, num_threads
     {64, 256, 256},
-    {64, 128, 128},
-    {128, 64, 128}};
+    {64, 128, 128}};

 typedef struct {
   int blocks_per_sm;

View File

@@ -4,13 +4,7 @@
 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                   torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output, bool renormalize,
-                  std::optional<torch::Tensor> bias);
-
-void topk_sigmoid(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
-                  torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output, bool renormalize,
-                  std::optional<torch::Tensor> bias);
+                  torch::Tensor& gating_output, bool renormalize);

 void moe_sum(torch::Tensor& input, torch::Tensor& output);

View File

@@ -42,7 +42,7 @@ void moe_permute(
auto sort_workspace = torch::empty( auto sort_workspace = torch::empty(
{sorter_size}, {sorter_size},
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
torch::Tensor topk_ids_for_sort = topk_ids; auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
auto permuted_experts_id = torch::empty_like(topk_ids); auto permuted_experts_id = torch::empty_like(topk_ids);
auto sorted_row_idx = torch::empty_like(inv_permuted_idx); auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
@@ -62,51 +62,35 @@ void moe_permute(
const int* expert_map_ptr = get_ptr<int>(expert_map.value()); const int* expert_map_ptr = get_ptr<int>(expert_map.value());
valid_num_ptr = valid_num_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert; get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
topk_ids_for_sort = topk_ids.clone(); preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
preprocessTopkIdLauncher(get_ptr<int>(topk_ids_for_sort), n_token * topk,
expert_map_ptr, n_expert, stream); expert_map_ptr, n_expert, stream);
} }
// expert sort topk expert id and scan expert id get expert_first_token_offset // expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert( sortAndScanExpert(
get_ptr<const int>(topk_ids_for_sort), get_ptr<int>(token_expert_indices), get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx), get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert, get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream); n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
// DeepGEMM: use getMIndices kernel to compute
// 1) align_expert_first_token_offset (aligned prefix offsets)
// 2) m_indices (expert id for each aligned row)
// eg. expert0: 3, expert1: 5, expert2: 2 tokens respectively
// expert_first_token_offset = [0, 3, 8, 10], align_block_size = 4
// expert0: 3->4, expert1: 5->8, expert2: 2->4
// align_expert_first_token_offset = [0, 4, 12, 16]
// so m_indices = [0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2]
torch::Tensor align_expert_first_token_offset;
const int64_t* aligned_expert_first_token_offset_ptr = nullptr;
if (align_block_size.has_value()) {
align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
aligned_expert_first_token_offset_ptr =
get_ptr<int64_t>(align_expert_first_token_offset);
}
// dispatch expandInputRowsKernelLauncher // dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] { MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>( expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input), get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx), get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx), get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset), get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
aligned_expert_first_token_offset_ptr, n_token, valid_num_ptr, n_hidden, n_hidden, topk, n_local_expert, align_block_size_value, stream);
topk, n_local_expert, align_block_size_value, stream);
}); });
// get m_indices and update expert_first_token_offset with align block
// this is only required for DeepGemm and not required for CUTLASS group gemm // this is only required for DeepGemm and not required for CUTLASS group gemm
if (align_block_size.has_value()) { if (align_block_size.has_value()) {
auto align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
expert_first_token_offset.copy_(align_expert_first_token_offset); expert_first_token_offset.copy_(align_expert_first_token_offset);
} }
} }
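A host-side sketch that reproduces the worked example given in the comments above, assuming getMIndices rounds each expert's token count up to align_block_size and prefix-sums the padded counts. The function name is hypothetical; this is a reading aid, not the CUDA implementation.

// CPU sketch of the DeepGEMM alignment bookkeeping described above:
// expert_first_token_offset = [0, 3, 8, 10], align_block_size = 4
//   -> aligned offsets        [0, 4, 12, 16]
//   -> m_indices              [0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2]
#include <cstdint>
#include <cstdio>
#include <vector>

void m_indices_reference(const std::vector<int64_t>& offsets, int align,
                         std::vector<int64_t>& aligned, std::vector<int>& m_indices) {
  const int n_experts = static_cast<int>(offsets.size()) - 1;
  aligned.assign(n_experts + 1, 0);
  m_indices.clear();
  for (int e = 0; e < n_experts; ++e) {
    int64_t n_tokens = offsets[e + 1] - offsets[e];
    int64_t padded = (n_tokens + align - 1) / align * align;  // round up per expert
    aligned[e + 1] = aligned[e] + padded;
    for (int64_t i = 0; i < padded; ++i) m_indices.push_back(e);
  }
}

int main() {
  std::vector<int64_t> offsets{0, 3, 8, 10}, aligned;
  std::vector<int> m_indices;
  m_indices_reference(offsets, 4, aligned, m_indices);
  for (int64_t v : aligned) std::printf("%lld ", static_cast<long long>(v));  // 0 4 12 16
  std::printf("\n");
  return 0;
}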

View File

@@ -109,7 +109,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
       sorted_indices, total_indices, num_experts, expert_first_token_offset);
 }

-void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
+void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
                        int* permuted_experts, int* permuted_rows,
                        int64_t* expert_first_token_offset, int num_rows,
                        int num_experts, int num_experts_per_node, int k,

View File

@@ -48,7 +48,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
                                    int64_t* expert_first_token_offset,
                                    cudaStream_t stream);

-void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
+void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
                        int* permuted_experts, int* permuted_rows,
                        int64_t* expert_first_token_offset, int num_rows,
                        int num_experts, int num_experts_per_node, int k,
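For orientation, here is a CPU sketch of what the sort-and-scan step is expected to produce, based on the comment in moe_permute above ("expert sort topk expert id and scan expert id get expert_first_token_offset"). The real path runs on the GPU with a device sorter and a workspace buffer, so this reference, with its hypothetical names, is only an assumed reading aid.

// Hypothetical CPU reference for the "sort rows by expert, then scan counts" step.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

struct SortScanResult {
  std::vector<int> permuted_experts;               // expert id per sorted row
  std::vector<int> sorted_row_idx;                 // original (token * k) row per sorted slot
  std::vector<int64_t> expert_first_token_offset;  // exclusive prefix sum, size n_expert + 1
};

SortScanResult sort_and_scan_reference(const std::vector<int>& topk_ids, int n_expert) {
  const int n_rows = static_cast<int>(topk_ids.size());
  SortScanResult r;
  r.sorted_row_idx.resize(n_rows);
  std::iota(r.sorted_row_idx.begin(), r.sorted_row_idx.end(), 0);
  // Stable sort keeps rows of the same expert in their original order.
  std::stable_sort(r.sorted_row_idx.begin(), r.sorted_row_idx.end(),
                   [&](int a, int b) { return topk_ids[a] < topk_ids[b]; });
  for (int row : r.sorted_row_idx) r.permuted_experts.push_back(topk_ids[row]);
  // Scan: offset[e + 1] - offset[e] equals the number of rows routed to expert e.
  r.expert_first_token_offset.assign(n_expert + 1, 0);
  for (int id : topk_ids)
    if (id >= 0 && id < n_expert) ++r.expert_first_token_offset[id + 1];
  std::partial_sum(r.expert_first_token_offset.begin(),
                   r.expert_first_token_offset.end(),
                   r.expert_first_token_offset.begin());
  return r;
}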
@@ -60,8 +60,7 @@ void expandInputRowsKernelLauncher(
     T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
     int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t const* expert_first_token_offset,
-    int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
+    int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
     int num_local_experts, const int& align_block_size, cudaStream_t stream);

View File

@@ -5,8 +5,7 @@ __global__ void expandInputRowsKernel(
     T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
     int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t const* expert_first_token_offset,
-    int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
+    int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_dest_rows, int64_t const cols, int64_t k,
     int num_local_experts, int align_block_size) {
   // Reverse permutation map.
@@ -19,22 +18,35 @@ __global__ void expandInputRowsKernel(
expanded_dest_row_to_expanded_source_row[expanded_dest_row]; expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row]; int expert_id = sorted_experts[expanded_dest_row];
extern __shared__ int64_t smem_expert_first_token_offset[];
if constexpr (ALIGN_BLOCK_SIZE) { if constexpr (ALIGN_BLOCK_SIZE) {
// convert (unaligned) expanded_dest_row -> aligned expanded_dest_row. // load g2s
// aligned_expert_first_token_offset[e] provides the aligned prefix start for (int idx = threadIdx.x; idx < num_local_experts + 1;
// for expert e. For non-local experts we map to the end (total aligned M). idx += blockDim.x) {
int64_t aligned_base = 0; smem_expert_first_token_offset[idx] =
int64_t token_offset_in_expert = 0; __ldg(expert_first_token_offset + idx);
if (expert_id >= num_local_experts) {
aligned_base =
__ldg(aligned_expert_first_token_offset + num_local_experts);
token_offset_in_expert = 0;
} else {
aligned_base = __ldg(aligned_expert_first_token_offset + expert_id);
token_offset_in_expert =
expanded_dest_row - __ldg(expert_first_token_offset + expert_id);
} }
expanded_dest_row = aligned_base + token_offset_in_expert; __syncthreads();
int lane_idx = threadIdx.x & 31;
if (lane_idx == 0) {
// set token_offset_in_expert = 0 if this expert is not local expert
int token_offset_in_expert =
expert_id >= num_local_experts
? 0
: expanded_dest_row - smem_expert_first_token_offset[expert_id];
int64_t accumulate_align_offset = 0;
#pragma unroll 1
for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) {
auto n_token_in_expert = smem_expert_first_token_offset[eidx] -
smem_expert_first_token_offset[eidx - 1];
accumulate_align_offset += (n_token_in_expert + align_block_size - 1) /
align_block_size * align_block_size;
}
expanded_dest_row = accumulate_align_offset + token_offset_in_expert;
}
// lane0 shuffle broadcast align_expanded_dest_row
expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0);
} }
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
@@ -76,8 +88,7 @@ void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts, T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row, int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) { int num_local_experts, const int& align_block_size, cudaStream_t stream) {
int64_t const blocks = num_rows * k; int64_t const blocks = num_rows * k;
@@ -93,12 +104,14 @@ void expandInputRowsKernelLauncher(
bool is_align_block_size = align_block_size != -1; bool is_align_block_size = align_block_size != -1;
auto func = func_map[is_check_skip][is_align_block_size]; auto func = func_map[is_check_skip][is_align_block_size];
func<<<blocks, threads, 0, stream>>>( int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
func<<<blocks, threads, smem_size, stream>>>(
unpermuted_input, permuted_output, sorted_experts, unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row, expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, permuted_idx, expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, aligned_expert_first_token_offset, num_rows, expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_valid_tokens_ptr, cols, k, num_local_experts, align_block_size); num_local_experts, align_block_size);
} }
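The launcher above sizes dynamic shared memory as sizeof(int64_t) * (num_local_experts + 1) so the kernel can stage expert_first_token_offset in shared memory before computing aligned destination rows. A minimal sketch of that extern-shared staging pattern follows; the kernel name is hypothetical and it is not the kernel in this diff.

// Sketch of the dynamic shared memory pattern used above.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void stage_offsets(const int64_t* offsets, int n_local_experts) {
  extern __shared__ int64_t smem_offsets[];
  // Cooperative global-to-shared copy of the (n_local_experts + 1) offsets.
  for (int i = threadIdx.x; i < n_local_experts + 1; i += blockDim.x)
    smem_offsets[i] = offsets[i];
  __syncthreads();
  // ... subsequent per-row work reads smem_offsets instead of global memory ...
}

void launch(const int64_t* offsets, int n_local_experts, cudaStream_t stream) {
  size_t smem_size = sizeof(int64_t) * (n_local_experts + 1);
  stage_offsets<<<1, 128, smem_size, stream>>>(offsets, n_local_experts);
}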
template <class T, class U> template <class T, class U>

View File

@@ -62,12 +62,6 @@ __device__ __forceinline__ float toFloat(T value) {
} }
} }
// Scoring function enums
enum ScoringFunc {
SCORING_SOFTMAX = 0, // apply softmax
SCORING_SIGMOID = 1 // apply sigmoid
};
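One practical difference between the two scoring modes enumerated above: softmax scores for a token already sum to one, while per-expert sigmoid scores do not, which is one reason an explicit renormalization of the selected weights exists at all. A tiny stand-alone illustration (values chosen arbitrarily):

// Softmax scores sum to 1 by construction; sigmoid scores are independent per expert.
#include <cmath>
#include <cstdio>

int main() {
  const float logits[4] = {2.f, 1.f, 0.f, -1.f};
  float denom = 0.f;
  for (float x : logits) denom += std::exp(x);
  float softmax_sum = 0.f, sigmoid_sum = 0.f;
  for (float x : logits) {
    softmax_sum += std::exp(x) / denom;         // contributes to exactly 1.0
    sigmoid_sum += 1.f / (1.f + std::exp(-x));  // no shared normalizer
  }
  std::printf("softmax sum = %.3f, sigmoid sum = %.3f\n", softmax_sum, sigmoid_sum);
  return 0;
}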
// ====================== Softmax things =============================== // ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output // We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing. // in the softmax kernel when we extend this module to support expert-choice routing.
@@ -131,27 +125,6 @@ __launch_bounds__(TPB) __global__
} }
} }
template <int TPB, typename InputType>
__launch_bounds__(TPB) __global__
void moeSigmoid(const InputType* input, const bool* finished, float* output, const int num_cols)
{
const int thread_row_offset = blockIdx.x * num_cols;
// Don't touch finished rows.
if ((finished != nullptr) && finished[blockIdx.x])
{
return;
}
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
const float val = toFloat(input[idx]);
const float sigmoid_val = 1.0f / (1.0f + __expf(-val));
output[idx] = sigmoid_val;
}
}
template <int TPB, typename IndType> template <int TPB, typename IndType>
__launch_bounds__(TPB) __global__ void moeTopK( __launch_bounds__(TPB) __global__ void moeTopK(
const float* inputs_after_softmax, const float* inputs_after_softmax,
@@ -163,8 +136,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const int k, const int k,
const int start_expert, const int start_expert,
const int end_expert, const int end_expert,
const bool renormalize, const bool renormalize)
const float* bias)
{ {
using cub_kvp = cub::KeyValuePair<int, float>; using cub_kvp = cub::KeyValuePair<int, float>;
@@ -190,13 +162,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
{ {
const int idx = thread_read_offset + expert; const int idx = thread_read_offset + expert;
inp_kvp.key = expert; inp_kvp.key = expert;
inp_kvp.value = inputs_after_softmax[idx];
// Apply correction bias if provided
if (bias != nullptr) {
inp_kvp.value = inputs_after_softmax[idx] + bias[expert];
} else {
inp_kvp.value = inputs_after_softmax[idx];
}
for (int prior_k = 0; prior_k < k_idx; ++prior_k) for (int prior_k = 0; prior_k < k_idx; ++prior_k)
{ {
@@ -220,13 +186,12 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const bool should_process_row = row_is_active && node_uses_expert; const bool should_process_row = row_is_active && node_uses_expert;
const int idx = k * block_row + k_idx; const int idx = k * block_row + k_idx;
// Return the unbiased scores for output weights output[idx] = result_kvp.value;
output[idx] = inputs_after_softmax[thread_read_offset + expert];
indices[idx] = should_process_row ? (expert - start_expert) : num_experts; indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
assert(indices[idx] >= 0); assert(indices[idx] >= 0);
source_rows[idx] = k_idx * num_rows + block_row; source_rows[idx] = k_idx * num_rows + block_row;
if (renormalize) { if (renormalize) {
selected_sum += inputs_after_softmax[thread_read_offset + expert]; selected_sum += result_kvp.value;
} }
} }
__syncthreads(); __syncthreads();
@@ -260,12 +225,10 @@ __launch_bounds__(TPB) __global__ void moeTopK(
2) This implementation assumes k is small, but will work for any k. 2) This implementation assumes k is small, but will work for any k.
*/ */
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
typename InputType = float, ScoringFunc SF>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
void topkGating(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices, void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize, int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
const float* bias)
{ {
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> || static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
std::is_same_v<InputType, __half>, std::is_same_v<InputType, __half>,
@@ -390,89 +353,61 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
} }
} }
if constexpr (SF == SCORING_SOFTMAX) { // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
// First, we perform a max reduce within the thread. // convert to float afterwards for the exp + sum reduction.
float thread_max = row_chunk[0]; float thread_max = row_chunk[0];
#pragma unroll #pragma unroll
for (int ii = 1; ii < VPT; ++ii) { for (int ii = 1; ii < VPT; ++ii)
{
thread_max = max(thread_max, row_chunk[ii]); thread_max = max(thread_max, row_chunk[ii]);
} }
// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce. // Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
#pragma unroll #pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
{ {
thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW)); thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
} }
// From this point, thread max in all the threads have the max within the row. // From this point, thread max in all the threads have the max within the row.
// Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum. // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
float row_sum = 0; float row_sum = 0;
#pragma unroll #pragma unroll
for (int ii = 0; ii < VPT; ++ii) for (int ii = 0; ii < VPT; ++ii)
{ {
row_chunk[ii] = expf(row_chunk[ii] - thread_max); row_chunk[ii] = expf(row_chunk[ii] - thread_max);
row_sum += row_chunk[ii]; row_sum += row_chunk[ii];
} }
// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a butterfly pattern. // Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a butterfly pattern.
#pragma unroll #pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
{ {
row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW); row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
} }
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
// respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
// compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row. // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
// However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
// argmax after computing the softmax. // argmax after computing the softmax.
const float reciprocal_row_sum = 1.f / row_sum; const float reciprocal_row_sum = 1.f / row_sum;
#pragma unroll #pragma unroll
for (int ii = 0; ii < VPT; ++ii) for (int ii = 0; ii < VPT; ++ii)
{ {
row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum; row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
}
} else if constexpr (SF == SCORING_SIGMOID) {
#pragma unroll
for (int ii = 0; ii < VPT; ++ii)
{
row_chunk[ii] = 1.0f / (1.0f + __expf(-row_chunk[ii]));
}
} }
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
// If bias is not null, use biased value for selection
float row_chunk_for_choice[VPT];
// Apply correction bias
if (bias != nullptr) {
#pragma unroll
for (int ldg = 0; ldg < LDG_PER_THREAD; ++ldg) {
#pragma unroll
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
const int expert = first_elt_read_by_thread + ldg * COLS_PER_GROUP_LDG + ii;
float bias_val = expert < NUM_EXPERTS ? bias[expert] : 0.0f;
row_chunk_for_choice[ldg * ELTS_PER_LDG + ii] = row_chunk[ldg * ELTS_PER_LDG + ii] + bias_val;
}
}
} else {
#pragma unroll
for (int ii = 0; ii < VPT; ++ii) {
row_chunk_for_choice[ii] = row_chunk[ii];
}
}
// Now, row_chunk contains the softmax / sigmoid of the row chunk. Now, I want to find the topk elements in each row, along
// with the max index. // with the max index.
int start_col = first_elt_read_by_thread; int start_col = first_elt_read_by_thread;
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
float selected_sum = 0.f; float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx) for (int k_idx = 0; k_idx < k; ++k_idx)
{ {
// First, each thread does the local argmax // First, each thread does the local argmax
float max_val_for_choice = row_chunk_for_choice[0];
float max_val = row_chunk[0]; float max_val = row_chunk[0];
int expert = start_col; int expert = start_col;
#pragma unroll #pragma unroll
@@ -481,14 +416,12 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
#pragma unroll #pragma unroll
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) for (int ii = 0; ii < ELTS_PER_LDG; ++ii)
{ {
float val_for_choice = row_chunk_for_choice[ldg * ELTS_PER_LDG + ii];
float val = row_chunk[ldg * ELTS_PER_LDG + ii]; float val = row_chunk[ldg * ELTS_PER_LDG + ii];
// No check on the experts here since columns with the smallest index are processed first and only // No check on the experts here since columns with the smallest index are processed first and only
// updated if > (not >=) // updated if > (not >=)
if (val_for_choice > max_val_for_choice) if (val > max_val)
{ {
max_val_for_choice = val_for_choice;
max_val = val; max_val = val;
expert = col + ii; expert = col + ii;
} }
@@ -501,14 +434,12 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
#pragma unroll #pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
{ {
float other_max_for_choice = VLLM_SHFL_XOR_SYNC_WIDTH(max_val_for_choice, mask, THREADS_PER_ROW);
float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW); float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW); int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
// We want lower indices to "win" in every thread so we break ties this way // We want lower indices to "win" in every thread so we break ties this way
if (other_max_for_choice > max_val_for_choice || (other_max_for_choice == max_val_for_choice && other_expert < expert)) if (other_max > max_val || (other_max == max_val && other_expert < expert))
{ {
max_val_for_choice = other_max_for_choice;
max_val = other_max; max_val = other_max;
expert = other_expert; expert = other_expert;
} }
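The shuffle exchange above is a butterfly (XOR-pattern) reduction across the lanes of a row group, with the documented "lower index wins ties" rule. A stand-alone sketch of the same idea, simplified to a full 32-lane warp with a hypothetical helper name:

// Warp-wide arg-max via butterfly (XOR) shuffles; the lower index wins ties.
__device__ inline void warp_argmax(float& val, int& idx, int width = 32) {
  for (int mask = width / 2; mask > 0; mask /= 2) {
    float other_val = __shfl_xor_sync(0xffffffff, val, mask, width);
    int other_idx = __shfl_xor_sync(0xffffffff, idx, mask, width);
    if (other_val > val || (other_val == val && other_idx < idx)) {
      val = other_val;
      idx = other_idx;
    }
  }
  // After log2(width) rounds every lane holds the same (val, idx) pair.
}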
@@ -543,7 +474,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
{ {
const int offset_for_expert = expert % ELTS_PER_LDG; const int offset_for_expert = expert % ELTS_PER_LDG;
// Safe to set to any negative value since row_chunk values must be between 0 and 1. // Safe to set to any negative value since row_chunk values must be between 0 and 1.
row_chunk_for_choice[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f; row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
} }
} }
} }
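Both versions of the gating kernel realize top-k as k repeated arg-max passes, knocking each winner out of later passes by overwriting its slot with a large negative number (safe because the post-scoring values lie in [0, 1]). The same scheme on the CPU, as a reading aid rather than the kernel's logic, with a hypothetical function name:

// k successive arg-max passes with masking; scores are assumed to lie in [0, 1].
#include <utility>
#include <vector>

std::vector<std::pair<int, float>> topk_by_masking(std::vector<float> scores, int k) {
  std::vector<std::pair<int, float>> out;
  for (int pass = 0; pass < k; ++pass) {
    int best = 0;
    for (int i = 1; i < static_cast<int>(scores.size()); ++i)
      if (scores[i] > scores[best]) best = i;  // strict '>' keeps the lowest index on ties
    out.emplace_back(best, scores[best]);
    scores[best] = -10000.f;  // mask the winner out of later passes
  }
  return out;
}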
@@ -577,10 +508,10 @@ struct TopkConstants
}; };
} // namespace detail } // namespace detail
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType, ScoringFunc SF> template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
void topkGatingLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices, void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
const float* bias, cudaStream_t stream) cudaStream_t stream)
{ {
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS); static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>; using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
@@ -590,51 +521,43 @@ void topkGatingLauncherHelper(const InputType* input, const bool* finished, floa
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
topkGating<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType, SF><<<num_blocks, block_dim, 0, stream>>>( topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize, bias); input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
} }
#ifndef USE_ROCM #ifndef USE_ROCM
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
static_assert(WARP_SIZE == 32, \ static_assert(WARP_SIZE == 32, \
"Unsupported warp size. Only 32 is supported for CUDA"); \ "Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES, \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
IndType, InputType, SF>( \ gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
gating_output, nullptr, topk_weights, topk_indices, \ num_tokens, topk, 0, num_experts, renormalize, stream);
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
bias, stream);
#else #else
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
if (WARP_SIZE == 64) { \ if (WARP_SIZE == 64) { \
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES, \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
IndType, InputType, SF>( \ gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
gating_output, nullptr, topk_weights, topk_indices, \ num_tokens, topk, 0, num_experts, renormalize, stream); \
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \ } else if (WARP_SIZE == 32) { \
bias, stream); \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
} else if (WARP_SIZE == 32) { \ gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES, \ num_tokens, topk, 0, num_experts, renormalize, stream); \
IndType, InputType, SF>( \ } else { \
gating_output, nullptr, topk_weights, topk_indices, \ assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
bias, stream); \
} else { \
assert(false && \
"Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
} }
#endif #endif
template <typename IndType, typename InputType, ScoringFunc SF> template <typename IndType, typename InputType>
void topkGatingKernelLauncher( void topkGatingSoftmaxKernelLauncher(
const InputType* gating_output, const InputType* gating_output,
float* topk_weights, float* topk_weights,
IndType* topk_indices, IndType* topk_indices,
int* token_expert_indices, int* token_expert_indices,
float* workspace, float* softmax_workspace,
const int num_tokens, const int num_tokens,
const int num_experts, const int num_experts,
const int topk, const int topk,
const bool renormalize, const bool renormalize,
const float* bias,
cudaStream_t stream) { cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4; static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
@@ -646,71 +569,64 @@ void topkGatingKernelLauncher(
#endif #endif
switch (num_experts) { switch (num_experts) {
case 1: case 1:
LAUNCH_TOPK(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 2: case 2:
LAUNCH_TOPK(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 4: case 4:
LAUNCH_TOPK(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 8: case 8:
LAUNCH_TOPK(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 16: case 16:
LAUNCH_TOPK(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 32: case 32:
LAUNCH_TOPK(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 64: case 64:
LAUNCH_TOPK(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 128: case 128:
LAUNCH_TOPK(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 256: case 256:
LAUNCH_TOPK(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 512: case 512:
LAUNCH_TOPK(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
// (CUDA only) support multiples of 64 when num_experts is not power of 2. // (CUDA only) support multiples of 64 when num_experts is not power of 2.
// ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts, // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
// alternatively we can test 4 bytes loading and enable it in future. // alternatively we can test 4 bytes loading and enable it in future.
#ifndef USE_ROCM #ifndef USE_ROCM
case 192: case 192:
LAUNCH_TOPK(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 320: case 320:
LAUNCH_TOPK(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 384: case 384:
LAUNCH_TOPK(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 448: case 448:
LAUNCH_TOPK(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 576: case 576:
LAUNCH_TOPK(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
#endif #endif
default: { default: {
TORCH_CHECK(workspace != nullptr, TORCH_CHECK(softmax_workspace != nullptr,
"workspace must be provided for num_experts that are not a power of 2 or multiple of 64."); "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256; static constexpr int TPB = 256;
if constexpr (SF == SCORING_SOFTMAX) { moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>( gating_output, nullptr, softmax_workspace, num_experts);
gating_output, nullptr, workspace, num_experts);
} else if constexpr (SF == SCORING_SIGMOID) {
moeSigmoid<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, workspace, num_experts);
} else {
TORCH_CHECK(false, "Unsupported scoring func");
}
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>( moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
workspace, nullptr, topk_weights, topk_indices, token_expert_indices, softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts, renormalize, bias); num_experts, topk, 0, num_experts, renormalize);
} }
} }
} }
@@ -719,55 +635,40 @@ void topkGatingKernelLauncher(
} // namespace vllm } // namespace vllm
template<typename ComputeType, vllm::moe::ScoringFunc SF> template<typename ComputeType>
void dispatch_topk_launch( void dispatch_topk_softmax_launch(
torch::Tensor& gating_output, torch::Tensor& gating_output,
torch::Tensor& topk_weights, torch::Tensor& topk_weights,
torch::Tensor& topk_indices, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices, torch::Tensor& token_expert_indices,
torch::Tensor& softmax_workspace, torch::Tensor& softmax_workspace,
int num_tokens, int num_experts, int topk, bool renormalize, int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
std::optional<torch::Tensor> bias, {
cudaStream_t stream)
{
const float* bias_ptr = nullptr;
if (bias.has_value()) {
const torch::Tensor& bias_tensor = bias.value();
TORCH_CHECK(bias_tensor.scalar_type() == at::ScalarType::Float, "bias tensor must be float32");
TORCH_CHECK(bias_tensor.dim() == 1, "bias tensor must be 1D");
TORCH_CHECK(bias_tensor.size(0) == num_experts, "bias size mismatch, expected: ", num_experts);
TORCH_CHECK(bias_tensor.is_contiguous(), "bias tensor must be contiguous");
bias_ptr = bias_tensor.data_ptr<float>();
}
if (topk_indices.scalar_type() == at::ScalarType::Int) { if (topk_indices.scalar_type() == at::ScalarType::Int) {
vllm::moe::topkGatingKernelLauncher<int, ComputeType, SF>( vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()), reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(), topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(), token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(), softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, num_tokens, num_experts, topk, renormalize, stream);
bias_ptr, stream);
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) { } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
vllm::moe::topkGatingKernelLauncher<uint32_t, ComputeType, SF>( vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()), reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(), topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(), token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(), softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, num_tokens, num_experts, topk, renormalize, stream);
bias_ptr, stream);
} else { } else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long); TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingKernelLauncher<int64_t, ComputeType, SF>( vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()), reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(), topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(), token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(), softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, num_tokens, num_experts, topk, renormalize, stream);
bias_ptr, stream);
} }
} }
@@ -776,8 +677,7 @@ void topk_softmax(
torch::Tensor& topk_indices, // [num_tokens, topk] torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk] torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output, // [num_tokens, num_experts] torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize, bool renormalize)
std::optional<torch::Tensor> bias)
{ {
const int num_experts = gating_output.size(-1); const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts; const auto num_tokens = gating_output.numel() / num_experts;
@@ -793,55 +693,14 @@ void topk_softmax(
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options); torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
if (gating_output.scalar_type() == at::ScalarType::Float) { if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_launch<float, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices, dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) { } else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_launch<__half, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices, dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) { } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices, dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
bias, stream);
} else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
}
}
void topk_sigmoid(
torch::Tensor& topk_weights, // [num_tokens, topk]
torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize,
std::optional<torch::Tensor> bias)
{
const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts;
const int topk = topk_weights.size(-1);
const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
const bool needs_workspace = !is_pow_2 || num_experts > 256;
const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
torch::Tensor workspace = torch::empty({workspace_size}, workspace_options);
if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_launch<float, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_launch<__half, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
bias, stream);
} else { } else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type()); TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
} }

View File

@@ -5,17 +5,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   // Apply topk softmax to the gating outputs.
   m.def(
       "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
-      "bias) -> ()");
+      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

-  // Apply topk sigmoid to the gating outputs.
-  m.def(
-      "topk_sigmoid(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
-      "bias) -> ()");
-  m.impl("topk_sigmoid", torch::kCUDA, &topk_sigmoid);
-
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
   m.def("moe_sum(Tensor input, Tensor! output) -> ()");

View File

@@ -260,6 +260,12 @@ void get_cutlass_moe_mm_data(
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);

+void get_cutlass_moe_mm_problem_sizes(
+    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);
+
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
     const torch::Tensor& expert_first_token_offset,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -293,8 +299,7 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_scale, torch::Tensor& output_scale,
torch::Tensor const& input_scale, torch::Tensor const& input_scale);
bool is_sf_swizzled_layout);
void scaled_fp4_experts_quant( void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,

View File

@@ -27,24 +27,17 @@
#include "cuda_utils.h" #include "cuda_utils.h"
#include "launch_bounds_utils.h" #include "launch_bounds_utils.h"
// Define before including nvfp4_utils.cuh so the header
// can use this macro during compilation.
#define NVFP4_ENABLE_ELTS16 1
#include "nvfp4_utils.cuh" #include "nvfp4_utils.cuh"
namespace vllm { namespace vllm {
// Use UE4M3 by default. // Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false> template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
int32_t num_padded_cols, float const* SFScale, uint32_t* out,
Type const* __restrict__ in, uint32_t* SFout) {
float const* __restrict__ SFScale, using PackedVec = PackedVec<Type>;
uint32_t* __restrict__ out,
uint32_t* __restrict__ SFout) {
using PackedVec = vllm::PackedVec<Type>;
static constexpr int CVT_FP4_NUM_THREADS_PER_SF = static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
@@ -56,60 +49,34 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = (SFScale == nullptr) ? 1.0f : SFScale[0];
-
-  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
-  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
 
   // Input tensor row/col loops.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    if (colIdx < num_padded_cols) {
-      PackedVec in_vec;
-      PackedVec in_vec2;
+    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+         colIdx += blockDim.x) {
       int64_t inOffset =
           rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx;
       int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) +
                           numCols / CVT_FP4_ELTS_PER_THREAD + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];
 
-      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
-      if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
-        ld256_or_zero_cg_u32<Type>(
-            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 8],
-            valid);
-      } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
-        ld128_or_zero_cg_u32<Type>(
-            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 4],
-            valid);
-      }
+      // Get the output tensor offset.
+      // Same as inOffset because 8 elements are packed into one uint32_t.
+      int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      auto& out_pos = out[outOffset];
 
       // Compute silu and mul
-      PackedVec out_silu_mul = compute_silu_mul<Type>(in_vec, in_vec2);
+      PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
 
       auto sf_out =
           cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                              CVT_FP4_NUM_THREADS_PER_SF>(
               rowIdx, colIdx, numKTiles, SFout);
-      auto out_val =
-          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-              out_silu_mul, SFScaleVal, sf_out);
-      if (valid) {
-        if constexpr (CVT_FP4_PACK16) {
-          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
-          uint64_t packed64 =
-              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
-          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
-        } else {
-          out[inOffset] = out_val;
-        }
-      }
+      out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
+                                                     sf_out);
     }
   }
 }
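For reference, a tiny host-side sketch of the packing arithmetic behind the outOffset bookkeeping in this hunk: each FP4 value occupies 4 bits, so 8 values fill one uint32_t, and the PACK16 path combines two such words (hi, lo) into a single 64-bit store to halve the number of output transactions. The nibble patterns below are made up for illustration.

#include <cstdint>
#include <cstdio>

int main() {
  // 8 FP4 values per 32-bit word: this is why the unpadded output offset
  // matches the input offset when each thread handles 8 elements.
  const int bits_per_fp4 = 4;
  const int elts_per_u32 = 32 / bits_per_fp4;  // 8
  printf("fp4 values per uint32_t: %d\n", elts_per_u32);

  // PACK16 path: two 32-bit halves are fused into one 64-bit store.
  const uint32_t lo = 0x76543210u, hi = 0xFEDCBA98u;  // example packed nibbles
  const uint64_t packed64 = (uint64_t(hi) << 32) | uint64_t(lo);
  printf("packed64 = 0x%016llx\n", (unsigned long long)packed64);
  return 0;
}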
@@ -136,23 +103,17 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
   auto output_ptr = static_cast<int64_t*>(output.data_ptr());
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
 
-  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
   int const numBlocksPerSM =
       vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
-  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
-  int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
-  int grid_x = std::min(
-      int(m), std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
-  dim3 grid(grid_x, grid_y);
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
 
   VLLM_DISPATCH_HALF_TYPES(
       input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
         using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
         auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
         vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
-            m, n, sf_n_unpadded, input_ptr, input_sf_ptr,
+            m, n, input_ptr, input_sf_ptr,
             reinterpret_cast<uint32_t*>(output_ptr),
             reinterpret_cast<uint32_t*>(sf_out));
       });
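A host-side sketch of how the two-dimensional launch configuration on the removed side of this hunk comes together: grid.y covers the per-row scale-factor columns in block-sized chunks, and grid.x is capped by how many blocks the device can keep resident. The tensor shape, SM count, and blocks-per-SM figures are assumptions for illustration only.

#include <algorithm>
#include <cstdio>

// Ceil-division helper mirroring vllm::div_round_up.
int div_round_up(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Illustrative numbers: 2048 output columns, 8 elements per thread,
  // a GPU with 132 SMs that can keep 2 such blocks resident per SM.
  const int m = 8192, n = 2048, elts_per_thread = 8;
  const int block_x = std::min(n / elts_per_thread, 512);
  const int multiProcessorCount = 132, numBlocksPerSM = 2;

  const int sf_vec_size = 16;                 // assumed CVT_FP4_SF_VEC_SIZE
  const int sf_n_unpadded = n / sf_vec_size;  // scale factors per row
  const int grid_y = div_round_up(sf_n_unpadded, block_x);
  const int grid_x = std::min(
      m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
  printf("block=(%d) grid=(%d, %d)\n", block_x, grid_x, grid_y);
  return 0;
}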

View File

@@ -140,8 +140,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
                                              CVT_FP4_NUM_THREADS_PER_SF>(
               rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
-      out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-          quant_input, SFScaleVal, sf_out);
+      out_pos =
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
     }
   }

@@ -246,8 +246,8 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
                                              CVT_FP4_NUM_THREADS_PER_SF>(
               rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
-      out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-          quant_input, SFScaleVal, sf_out);
+      out_pos =
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
     }
   }
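The extra template argument on the removed side is the threads-per-scale-factor count. Assuming the usual NVFP4 constants (a 16-element scale-factor block and 8 elements per thread), it works out as in this trivial sketch; the constant values are assumptions, not taken from the headers.

#include <cstdio>

int main() {
  // Assumed: CVT_FP4_SF_VEC_SIZE == 16 and CVT_FP4_ELTS_PER_THREAD == 8,
  // so two neighbouring threads cooperate on each scale factor.
  const int sf_vec_size = 16, elts_per_thread = 8;
  printf("threads per scale factor: %d\n", sf_vec_size / elts_per_thread);
  return 0;
}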

View File

@@ -21,8 +21,7 @@
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                              torch::Tensor const& input,
                              torch::Tensor const& output_sf,
-                             torch::Tensor const& input_sf,
-                             bool is_sf_swizzled_layout);
+                             torch::Tensor const& input_sf);
 #endif
 
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \

@@ -52,12 +51,10 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
 #endif
 
 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
-                      torch::Tensor& output_sf, torch::Tensor const& input_sf,
-                      bool is_sf_swizzled_layout) {
+                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
     (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
-  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
-                                 is_sf_swizzled_layout);
+  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf);
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
 }

View File

@@ -27,23 +27,29 @@
#include "cuda_utils.h" #include "cuda_utils.h"
#include "launch_bounds_utils.h" #include "launch_bounds_utils.h"
// Define before including nvfp4_utils.cuh so the header
// can use this macro during compilation.
#define NVFP4_ENABLE_ELTS16 1
#include "nvfp4_utils.cuh" #include "nvfp4_utils.cuh"
namespace vllm { namespace vllm {
template <typename Int>
__host__ __device__ inline Int round_up(Int x, Int y) {
static_assert(std::is_integral_v<Int>,
"round_up argument must be integral type");
return ((x + y - 1) / y) * y;
}
// Compute effective rows for grid configuration with swizzled SF layouts.
inline int computeEffectiveRows(int m) {
constexpr int ROW_TILE = 128;
return round_up(m, ROW_TILE);
}
// Use UE4M3 by default. // Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false> template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, int32_t num_padded_cols, cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
Type const* __restrict__ in, float const* SFScale, uint32_t* out, uint32_t* SFout) {
float const* __restrict__ SFScale, using PackedVec = PackedVec<Type>;
uint32_t* __restrict__ out, uint32_t* __restrict__ SFout) {
using PackedVec = vllm::PackedVec<Type>;
static constexpr int CVT_FP4_NUM_THREADS_PER_SF = static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
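A host-side sketch of the same padding arithmetic as the round_up and computeEffectiveRows helpers above: row counts are padded to the 128-row tile used by the swizzled scale-factor layout, so the grid is sized from the padded count rather than the raw one.

#include <cstdio>
#include <type_traits>

// Same shape as the helper above: round an integer up to a multiple of y.
template <typename Int>
constexpr Int round_up(Int x, Int y) {
  static_assert(std::is_integral_v<Int>,
                "round_up argument must be integral type");
  return ((x + y - 1) / y) * y;
}

int main() {
  constexpr int ROW_TILE = 128;  // row tile of the swizzled SF layout
  for (int m : {1, 127, 128, 129, 1000}) {
    printf("m=%4d -> effective rows=%d\n", m, round_up(m, ROW_TILE));
  }
  return 0;
}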
@@ -53,31 +59,33 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   int32_t const numKTiles = (numCols + 63) / 64;
 
   int sf_m = round_up<int>(numRows, 128);
-  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
-  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
+  int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
 
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
 
   // Iterate over all rows and cols including padded ones -
   // ensures we visit every single scale factor address to initialize it.
   for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
-    if (colIdx < num_padded_cols) {
+    for (int colIdx = threadIdx.x;
+         colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
+         colIdx += blockDim.x) {
+      int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
       PackedVec in_vec;
       int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
       // If we are outside valid rows OR outside valid columns -> Use Zeros
-      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
-      if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
-      } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
-      }
+      if (rowIdx >= numRows || elem_idx >= numCols) {
+        memset(&in_vec, 0, sizeof(PackedVec));
+      } else {
+        // Valid Region: Load actual data
+        in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      }
 
       auto sf_out =
@@ -86,85 +94,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
               rowIdx, colIdx, numKTiles, SFout);
       auto out_val =
-          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-              in_vec, global_scale, sf_out);
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
       // We do NOT write output for padding because the 'out' tensor is not
       // padded.
-      if (valid) {
-        if constexpr (CVT_FP4_PACK16) {
-          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
-          uint64_t packed64 =
-              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
-          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
-        } else {
-          out[inOffset] = out_val;
-        }
-      }
-    }
-  }
-}
-
-// Use UE4M3 by default.
-template <class Type, bool UE8M0_SF = false>
-__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
-    cvt_fp16_to_fp4_sf_major(int32_t numRows, int32_t numCols,
-                             int32_t sf_n_unpadded, Type const* __restrict__ in,
-                             float const* __restrict__ SFScale,
-                             uint32_t* __restrict__ out,
-                             uint32_t* __restrict__ SFout) {
-  using PackedVec = PackedVec<Type>;
-  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
-      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
-  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
-                "Vec size is not matched.");
-
-  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
-  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
-
-  // Get the global scaling factor, which will be applied to the SF.
-  // Note SFScale is the same as next GEMM's alpha, which is
-  // (448.f / (Alpha_A / 6.f)).
-  float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
-
-  // Iterate over all rows and cols including padded ones -
-  // ensures we visit every single scale factor address to initialize it.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    if (colIdx < sf_n_unpadded) {
-      PackedVec in_vec;
-      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-      // If we are outside valid rows OR outside valid columns -> Use Zeros
-      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
-      if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
-      } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
-      }
-
-      auto sf_out =
-          sf_out_rowmajor_u8<uint32_t>(rowIdx, colIdx, sf_n_unpadded, SFout);
-      auto out_val =
-          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-              in_vec, global_scale, sf_out);
-      // We do NOT write output for padding because the 'out' tensor is not
-      // padded.
-      if (valid) {
-        if constexpr (CVT_FP4_PACK16) {
-          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
-          uint64_t packed64 =
-              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
-          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
-        } else {
-          out[inOffset] = out_val;
-        }
+      if (rowIdx < numRows && elem_idx < numCols) {
+        // Same as inOffset because 8 elements are packed into one uint32_t.
+        out[inOffset] = out_val;
       }
     }
   }
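A worked example of the column padding used by this kernel, assuming the usual CVT_FP4_SF_VEC_SIZE of 16. The "/ 4" and "* 4" steps reflect per-row scale-factor counts being rounded up to a multiple of four (consistent with scale factors being handled four bytes at a time); the column sizes below are illustrative.

#include <cstdio>

int round_up(int x, int y) { return ((x + y - 1) / y) * y; }

int main() {
  const int sf_vec_size = 16;  // assumed CVT_FP4_SF_VEC_SIZE
  for (int numCols : {1024, 1040, 2048, 7168}) {
    // One scale factor per 16 elements, padded to a multiple of 4 SFs.
    int sf_n_unpadded = numCols / sf_vec_size;
    int sf_n_int = round_up(sf_n_unpadded, 4) / 4;
    int num_padded_cols = sf_n_int * 4 * sf_vec_size;
    printf("numCols=%5d sf_n_unpadded=%4d padded cols=%5d\n",
           numCols, sf_n_unpadded, num_padded_cols);
  }
  return 0;
}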
@@ -175,8 +111,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                              torch::Tensor const& input,
                              torch::Tensor const& output_sf,
-                             torch::Tensor const& input_sf,
-                             bool is_sf_swizzled_layout) {
+                             torch::Tensor const& input_sf) {
   int32_t m = input.size(0);
   int32_t n = input.size(1);
@@ -194,48 +129,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
 
-  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
   // Grid, Block size. Each thread converts 8 values.
   dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
   int const numBlocksPerSM =
       vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  int effectiveRows = vllm::computeEffectiveRows(m);
+  dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
 
-  if (is_sf_swizzled_layout) {
-    int sf_n_int = int(vllm::round_up(sf_n_unpadded, 4) / 4);
-    int32_t num_padded_cols =
-        sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
-
-    int grid_y = vllm::div_round_up(num_padded_cols, static_cast<int>(block.x));
-    int grid_x =
-        std::min(vllm::computeEffectiveRows(m),
-                 std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
-    dim3 grid(grid_x, grid_y);
-
-    VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
-      using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
-      auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-      // NOTE: We don't support e8m0 scales at this moment.
-      vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
-          m, n, num_padded_cols, input_ptr, input_sf_ptr,
-          reinterpret_cast<uint32_t*>(output_ptr),
-          reinterpret_cast<uint32_t*>(sf_out));
-    });
-  } else {
-    int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
-    int grid_x = std::min(
-        m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
-    dim3 grid(grid_x, grid_y);
-
-    VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
-      using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
-      auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-      // NOTE: We don't support e8m0 scales at this moment.
-      vllm::cvt_fp16_to_fp4_sf_major<cuda_type, false>
-          <<<grid, block, 0, stream>>>(m, n, sf_n_unpadded, input_ptr,
-                                       input_sf_ptr,
-                                       reinterpret_cast<uint32_t*>(output_ptr),
-                                       reinterpret_cast<uint32_t*>(sf_out));
-    });
-  }
-}
+  VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
+    using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+    auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+    // NOTE: We don't support e8m0 scales at this moment.
+    vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
+        m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
+        reinterpret_cast<uint32_t*>(sf_out));
+  });
+}
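Rough buffer-size arithmetic for this entry point under common NVFP4 conventions (two FP4 values per byte, one scale factor per 16 elements, swizzled scale factors padded to 128-row by 4-column tiles). These conventions and sizes are assumptions for illustration, not a statement of the actual allocation logic in vLLM.

#include <cstdio>

int round_up(int x, int y) { return ((x + y - 1) / y) * y; }

int main() {
  const int m = 300, n = 4096;                       // illustrative shape
  const long out_bytes = (long)m * n / 2;            // packed FP4 data
  const int sf_rows = round_up(m, 128);              // padded rows (swizzled)
  const int sf_cols = round_up(n / 16, 4);           // padded SF columns
  const long sf_bytes_swizzled = (long)sf_rows * sf_cols;  // one byte per SF
  const long sf_bytes_rowmajor = (long)m * (n / 16);
  printf("output: %ld bytes, SF swizzled: %ld bytes, SF row-major: %ld bytes\n",
         out_bytes, sf_bytes_swizzled, sf_bytes_rowmajor);
  return 0;
}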

Some files were not shown because too many files have changed in this diff.