Compare commits
376 Commits
v0.14.0rc2
...
v0.15.0rc1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58996f3589 | ||
|
|
b539f988e1 | ||
|
|
6c00645712 | ||
|
|
b781eeaa15 | ||
|
|
e0b005d9cf | ||
|
|
3b8f0fe59e | ||
|
|
c831911be2 | ||
|
|
157caf511b | ||
|
|
0b53bec60b | ||
|
|
c568581ff3 | ||
|
|
2d7053438a | ||
|
|
5a93b9162b | ||
|
|
6d86fde09c | ||
|
|
510ed1e8d3 | ||
|
|
8caffd92df | ||
|
|
58a05b0ca1 | ||
|
|
6ee7f18f33 | ||
|
|
8f987883cb | ||
|
|
ebe0ba91db | ||
|
|
43a013c3a2 | ||
|
|
c25dbee40d | ||
|
|
19ab0f7ce5 | ||
|
|
67fe677c53 | ||
|
|
d56afd45fd | ||
|
|
a2393ed496 | ||
|
|
be6931ee27 | ||
|
|
9ef3b718d9 | ||
|
|
bb17e8f11c | ||
|
|
dcd80206b7 | ||
|
|
f4a0921c9c | ||
|
|
208c56256f | ||
|
|
9ac818a551 | ||
|
|
6ca2c91b96 | ||
|
|
e33192b269 | ||
|
|
61274bdef5 | ||
|
|
b40db4dfec | ||
|
|
11b556878b | ||
|
|
ee484b3f4b | ||
|
|
a9b53dd435 | ||
|
|
254db42ede | ||
|
|
105d104576 | ||
|
|
566cdb6cfb | ||
|
|
2f0d3ba745 | ||
|
|
edf927bc9f | ||
|
|
22aeb43007 | ||
|
|
a698e8e7ad | ||
|
|
151e5451c2 | ||
|
|
73b243463b | ||
|
|
7e67df5570 | ||
|
|
ff6c1da4e6 | ||
|
|
fcb9df99bd | ||
|
|
1ebdff412a | ||
|
|
91601ff478 | ||
|
|
d4dbb7af63 | ||
|
|
203d0bc0c2 | ||
|
|
17ab54de81 | ||
|
|
cd775bdbe0 | ||
|
|
da5e7b12be | ||
|
|
719ac592ed | ||
|
|
1209b784f2 | ||
|
|
5fa0f6efa9 | ||
|
|
bc0d291bfe | ||
|
|
9ad7f89f55 | ||
|
|
6450b536a6 | ||
|
|
0f19427db5 | ||
|
|
51931c5c9a | ||
|
|
06b557ecd9 | ||
|
|
81c2a889ce | ||
|
|
8edaf38570 | ||
|
|
5c86a89805 | ||
|
|
0ccecf8833 | ||
|
|
0b9a735e11 | ||
|
|
14d03b8ddb | ||
|
|
d0cbac5827 | ||
|
|
c0d820457a | ||
|
|
97ef11dd34 | ||
|
|
ecc3dd66cc | ||
|
|
7e1f10d562 | ||
|
|
a28b94e6ef | ||
|
|
0118cdcc02 | ||
|
|
136c499f6e | ||
|
|
ebd0a17e0e | ||
|
|
37c9859fab | ||
|
|
4561f13985 | ||
|
|
6cc6d92be5 | ||
|
|
dfab5f3764 | ||
|
|
586a57ad7e | ||
|
|
3a41459501 | ||
|
|
8518b30447 | ||
|
|
2d6b537157 | ||
|
|
68b0a6c1ba | ||
|
|
5206e5e28c | ||
|
|
fec9da0af4 | ||
|
|
bbbd696af9 | ||
|
|
9b77bb790d | ||
|
|
305e53ade8 | ||
|
|
1cb4341fbc | ||
|
|
1fb648bf10 | ||
|
|
7e22309755 | ||
|
|
90c2007932 | ||
|
|
d95d650762 | ||
|
|
13d8746c54 | ||
|
|
10e94c84f6 | ||
|
|
243e78c20f | ||
|
|
aac0b817fa | ||
|
|
05f3d714db | ||
|
|
3f3f89529d | ||
|
|
5da4c7d789 | ||
|
|
160c6fa387 | ||
|
|
a8eb1182f1 | ||
|
|
fa6e599a61 | ||
|
|
7ef5873752 | ||
|
|
5e4e0e51f4 | ||
|
|
f61c9da711 | ||
|
|
7fe255889e | ||
|
|
dc917cceb8 | ||
|
|
fc56f4a071 | ||
|
|
d08b356ee0 | ||
|
|
f744810184 | ||
|
|
44f08af3a7 | ||
|
|
955b43a5a5 | ||
|
|
744ef30484 | ||
|
|
300622e609 | ||
|
|
69d09fdd6c | ||
|
|
3a63be0faa | ||
|
|
803e3f3f68 | ||
|
|
70917b1c55 | ||
|
|
c517d8c934 | ||
|
|
fc37187a51 | ||
|
|
ff365eea94 | ||
|
|
444e2e7e1f | ||
|
|
bc14663e6a | ||
|
|
654a71fc3c | ||
|
|
15e302dfce | ||
|
|
d117a4d1a9 | ||
|
|
421012b63a | ||
|
|
841d53aaa8 | ||
|
|
1752262e96 | ||
|
|
ea6102b85d | ||
|
|
328cbb2773 | ||
|
|
64e3d67ac0 | ||
|
|
098b2d66fe | ||
|
|
8ebf271bb6 | ||
|
|
49a1262267 | ||
|
|
2b8a38b6d6 | ||
|
|
1bf1a34b19 | ||
|
|
a810299838 | ||
|
|
eb1629da24 | ||
|
|
019e2c3b7c | ||
|
|
f5fdec8ce2 | ||
|
|
1579c9b5fd | ||
|
|
889722f3bf | ||
|
|
49d9653852 | ||
|
|
a1d82466ea | ||
|
|
24a163ed77 | ||
|
|
378385b90c | ||
|
|
c5487e2b96 | ||
|
|
6437ff1fb9 | ||
|
|
5e00b561cd | ||
|
|
408195ec59 | ||
|
|
63227accf5 | ||
|
|
e675dda67b | ||
|
|
24dc30f7ff | ||
|
|
180fba653e | ||
|
|
f999539869 | ||
|
|
e1da249c93 | ||
|
|
9b693d023c | ||
|
|
808d6fd7b9 | ||
|
|
1861ae8aae | ||
|
|
4e31b7f228 | ||
|
|
6c20e89c02 | ||
|
|
85f55c943c | ||
|
|
cea3c754c4 | ||
|
|
42135d6898 | ||
|
|
e14467be43 | ||
|
|
7727ce35c2 | ||
|
|
6bb2bc71e2 | ||
|
|
c80f92c14d | ||
|
|
f23fb5a7c1 | ||
|
|
360aa93f8f | ||
|
|
27ca95b3c9 | ||
|
|
b4f64e5b02 | ||
|
|
7ab80a8e37 | ||
|
|
0900cedb3f | ||
|
|
6f067b1fb7 | ||
|
|
27b81e010d | ||
|
|
7013e9ac8f | ||
|
|
c78ee240b3 | ||
|
|
d2389c1262 | ||
|
|
22375f8d13 | ||
|
|
9b67338b78 | ||
|
|
2261340806 | ||
|
|
86c69dc54c | ||
|
|
7c5dedc247 | ||
|
|
193069d129 | ||
|
|
f0feb1cf81 | ||
|
|
09194b90a5 | ||
|
|
9ab4388cd3 | ||
|
|
04a9e064db | ||
|
|
c025263ddd | ||
|
|
6c97b9b9b6 | ||
|
|
4ca62a0dbd | ||
|
|
7901109ea5 | ||
|
|
13f6630a9e | ||
|
|
fda3f03eb2 | ||
|
|
bb9172030e | ||
|
|
c4e5bdf61b | ||
|
|
7f1bcd18ff | ||
|
|
8be263c3fb | ||
|
|
e1a34c3a5d | ||
|
|
148117ea2e | ||
|
|
e9c83cdc51 | ||
|
|
b75e85dede | ||
|
|
4753f3bf69 | ||
|
|
6c01ffb897 | ||
|
|
7b7cdce968 | ||
|
|
12dab78f49 | ||
|
|
05dc4bfab6 | ||
|
|
1a1fc3bbc0 | ||
|
|
43fada5360 | ||
|
|
4a5299c93f | ||
|
|
73f2a81c75 | ||
|
|
7350331718 | ||
|
|
9d1e611f0e | ||
|
|
0727cc9ecf | ||
|
|
a0490be8f1 | ||
|
|
cd3ac5b797 | ||
|
|
2636d76257 | ||
|
|
aa7f37ccfa | ||
|
|
c88860d759 | ||
|
|
758df5afe7 | ||
|
|
cdd03d25d3 | ||
|
|
74c583bc50 | ||
|
|
c0a350ca73 | ||
|
|
71832ba71e | ||
|
|
11bbf86f6a | ||
|
|
3c8740aacb | ||
|
|
7518a3dc65 | ||
|
|
976af2f314 | ||
|
|
9a1f16da1e | ||
|
|
bb1848cd62 | ||
|
|
6101a26dc9 | ||
|
|
f5d1740030 | ||
|
|
eebc58df0c | ||
|
|
16de822c71 | ||
|
|
5480c6b1fa | ||
|
|
ba29ab441e | ||
|
|
afc3622602 | ||
|
|
327a02d8db | ||
|
|
2f03035a61 | ||
|
|
38bf2ffb21 | ||
|
|
c826c72a96 | ||
|
|
fe36bf5e80 | ||
|
|
963dc0b865 | ||
|
|
8cc26acd8b | ||
|
|
4a6af8813f | ||
|
|
4147910f1e | ||
|
|
3055232ba0 | ||
|
|
965765aef9 | ||
|
|
9e078d0582 | ||
|
|
2b99f210f5 | ||
|
|
1646fea672 | ||
|
|
d3317bbba4 | ||
|
|
8e61425ee6 | ||
|
|
2e7c89e708 | ||
|
|
037a6487af | ||
|
|
5a3050a089 | ||
|
|
484e22bc18 | ||
|
|
ca21288080 | ||
|
|
4c82b6fac7 | ||
|
|
a884bc62d6 | ||
|
|
7a1030431a | ||
|
|
9fd918e510 | ||
|
|
c9a533079c | ||
|
|
6ca4f400d8 | ||
|
|
180e981d56 | ||
|
|
b84c426a8c | ||
|
|
b66b0d6abb | ||
|
|
03da3b52ef | ||
|
|
14ce524249 | ||
|
|
4ae77dfd42 | ||
|
|
73f635a75f | ||
|
|
35bf5d08e8 | ||
|
|
5de6dd0662 | ||
|
|
709502558c | ||
|
|
46f8a982b1 | ||
|
|
bcf2333cd6 | ||
|
|
83239ff19a | ||
|
|
c277fbdf31 | ||
|
|
aca5c51487 | ||
|
|
31c29257c8 | ||
|
|
8c11001ba2 | ||
|
|
bd292be0c0 | ||
|
|
41c544f78a | ||
|
|
1be5a73571 | ||
|
|
c36ba69bda | ||
|
|
047413375c | ||
|
|
74e4bb1c5a | ||
|
|
b34474bf2c | ||
|
|
6218034dd7 | ||
|
|
77c16df31d | ||
|
|
130d6c9514 | ||
|
|
361dfdc9d8 | ||
|
|
8ebfacaa75 | ||
|
|
b89275d018 | ||
|
|
28459785ff | ||
|
|
8853a50af2 | ||
|
|
c5891b5430 | ||
|
|
707b44cc28 | ||
|
|
3a4e10c847 | ||
|
|
cbbae38f93 | ||
|
|
cdba4c74b3 | ||
|
|
a52d1396a7 | ||
|
|
1e584823f8 | ||
|
|
4c1c501a7e | ||
|
|
ae1eba6a9a | ||
|
|
e9ec2a72d8 | ||
|
|
2c9b4cf5bf | ||
|
|
9d7ae3fcdb | ||
|
|
3c2685645e | ||
|
|
773d7073ae | ||
|
|
edadca109c | ||
|
|
d86fc23bdd | ||
|
|
375e5984fe | ||
|
|
19b251fe3d | ||
|
|
15422ed3f7 | ||
|
|
8471b27df9 | ||
|
|
66652e8082 | ||
|
|
e27078ea80 | ||
|
|
d084e9fca7 | ||
|
|
3a612322eb | ||
|
|
9ea07b41da | ||
|
|
552b262936 | ||
|
|
00e6402d56 | ||
|
|
ce0946249d | ||
|
|
3f28174c6a | ||
|
|
769d0629e1 | ||
|
|
90db5b31e4 | ||
|
|
b8199f6049 | ||
|
|
7e6f123810 | ||
|
|
9312a6c03a | ||
|
|
6388b50058 | ||
|
|
048bb59728 | ||
|
|
7933638051 | ||
|
|
6b176095e3 | ||
|
|
9d0d7f48d5 | ||
|
|
50632adc58 | ||
|
|
6fa6e7ef0c | ||
|
|
90c0836902 | ||
|
|
8ef50d9a6b | ||
|
|
2a60ac91d0 | ||
|
|
9e65bb4ef4 | ||
|
|
0db574b185 | ||
|
|
2f4a71daf2 | ||
|
|
69f8a0ea37 | ||
|
|
f28125d87b | ||
|
|
46f8c6b725 | ||
|
|
af54d2e2d0 | ||
|
|
6beef12b9b | ||
|
|
ab74b2a27a | ||
|
|
2263d44b68 | ||
|
|
4f3676e726 | ||
|
|
510265472c | ||
|
|
4f02cb2eac | ||
|
|
252c011012 | ||
|
|
98f60e5acb | ||
|
|
fefce49807 | ||
|
|
a5bbbd2f24 | ||
|
|
8c8653b672 | ||
|
|
232214b2ae | ||
|
|
eb28e8068d | ||
|
|
542a4059b2 | ||
|
|
df7e12715f | ||
|
|
44c34f22d9 | ||
|
|
80221e1884 | ||
|
|
5e714f7ff4 |
@@ -1,7 +1,8 @@
|
||||
name: vllm_ci
|
||||
job_dirs:
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/image_build"
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/hardware_tests"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
|
||||
28
.buildkite/hardware_tests/amd.yaml
Normal file
28
.buildkite/hardware_tests/amd.yaml
Normal file
@@ -0,0 +1,28 @@
|
||||
group: Hardware
|
||||
steps:
|
||||
- label: "AMD: :docker: build image"
|
||||
device: amd_cpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- >
|
||||
docker build
|
||||
--build-arg max_jobs=16
|
||||
--build-arg REMOTE_VLLM=1
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
|
||||
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
||||
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
-f docker/Dockerfile.rocm
|
||||
--target test
|
||||
--no-cache
|
||||
--progress plain .
|
||||
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 1
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 1
|
||||
- exit_status: 1 # Machine occasionally fail
|
||||
limit: 1
|
||||
8
.buildkite/hardware_tests/arm.yaml
Normal file
8
.buildkite/hardware_tests/arm.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
group: Hardware
|
||||
steps:
|
||||
- label: "Arm CPU Test"
|
||||
soft_fail: true
|
||||
device: arm_cpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
|
||||
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
group: Hardware
|
||||
depends_on: ~
|
||||
steps:
|
||||
- label: "Ascend NPU Test"
|
||||
soft_fail: true
|
||||
timeout_in_minutes: 20
|
||||
no_plugin: true
|
||||
device: ascend_npu
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh
|
||||
10
.buildkite/hardware_tests/gh200.yaml
Normal file
10
.buildkite/hardware_tests/gh200.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
group: Hardware
|
||||
steps:
|
||||
- label: "GH200 Test"
|
||||
soft_fail: true
|
||||
device: gh200
|
||||
no_plugin: true
|
||||
optional: true
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
|
||||
23
.buildkite/hardware_tests/intel.yaml
Normal file
23
.buildkite/hardware_tests/intel.yaml
Normal file
@@ -0,0 +1,23 @@
|
||||
group: Hardware
|
||||
depends_on: ~
|
||||
steps:
|
||||
- label: "Intel CPU Test"
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
|
||||
|
||||
- label: "Intel HPU Test"
|
||||
soft_fail: true
|
||||
device: intel_hpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
|
||||
|
||||
- label: "Intel GPU Test"
|
||||
soft_fail: true
|
||||
device: intel_gpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
|
||||
@@ -1,56 +1,254 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
set -euo pipefail
|
||||
|
||||
if [[ $# -lt 8 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
# replace invalid characters in Docker image tags and truncate to 128 chars
|
||||
clean_docker_tag() {
|
||||
local input="$1"
|
||||
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
|
||||
}
|
||||
|
||||
print_usage_and_exit() {
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
}
|
||||
|
||||
print_instance_info() {
|
||||
echo ""
|
||||
echo "=== Debug: Instance Information ==="
|
||||
# Get IMDSv2 token
|
||||
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
|
||||
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
|
||||
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
|
||||
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
|
||||
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
|
||||
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
|
||||
echo "AMI ID: ${AMI_ID}"
|
||||
echo "Instance Type: ${INSTANCE_TYPE}"
|
||||
echo "Instance ID: ${INSTANCE_ID}"
|
||||
echo "AZ: ${AZ}"
|
||||
else
|
||||
echo "Not running on EC2 or IMDS not available"
|
||||
fi
|
||||
# Check for warm cache AMI (marker file baked into custom AMI)
|
||||
if [[ -f /etc/vllm-ami-info ]]; then
|
||||
echo "Cache: warm (custom vLLM AMI)"
|
||||
cat /etc/vllm-ami-info
|
||||
else
|
||||
echo "Cache: cold (standard AMI)"
|
||||
fi
|
||||
echo "==================================="
|
||||
echo ""
|
||||
}
|
||||
|
||||
setup_buildx_builder() {
|
||||
echo "--- :buildkite: Setting up buildx builder"
|
||||
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
|
||||
# Custom AMI with standalone buildkitd - use remote driver for warm cache
|
||||
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
|
||||
echo "Using remote driver to connect to buildkitd (warm cache available)"
|
||||
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
|
||||
echo "Using existing baked-vllm-builder"
|
||||
docker buildx use baked-vllm-builder
|
||||
else
|
||||
echo "Creating baked-vllm-builder with remote driver"
|
||||
docker buildx create \
|
||||
--name baked-vllm-builder \
|
||||
--driver remote \
|
||||
--use \
|
||||
"unix://${BUILDKIT_SOCKET}"
|
||||
fi
|
||||
docker buildx inspect --bootstrap
|
||||
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
|
||||
# Existing builder available
|
||||
echo "Using existing builder: ${BUILDER_NAME}"
|
||||
docker buildx use "${BUILDER_NAME}"
|
||||
docker buildx inspect --bootstrap
|
||||
else
|
||||
# No local buildkitd, no existing builder - create new docker-container builder
|
||||
echo "No local buildkitd found, using docker-container driver"
|
||||
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
fi
|
||||
|
||||
# builder info
|
||||
echo "Active builder:"
|
||||
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
|
||||
}
|
||||
|
||||
check_and_skip_if_image_exists() {
|
||||
if [[ -n "${IMAGE_TAG:-}" ]]; then
|
||||
echo "--- :mag: Checking if image exists"
|
||||
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
|
||||
echo "Image already exists: ${IMAGE_TAG}"
|
||||
echo "Skipping build"
|
||||
exit 0
|
||||
fi
|
||||
echo "Image not found, proceeding with build"
|
||||
fi
|
||||
}
|
||||
|
||||
ecr_login() {
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
}
|
||||
|
||||
prepare_cache_tags() {
|
||||
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
|
||||
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
|
||||
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
|
||||
|
||||
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
||||
cache="${MAIN_CACHE_ECR}:latest"
|
||||
else
|
||||
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
|
||||
cache="${TEST_CACHE_ECR}:${clean_branch}"
|
||||
fi
|
||||
CACHE_TO="$cache"
|
||||
CACHE_FROM="$cache"
|
||||
CACHE_FROM_BASE_BRANCH="$cache"
|
||||
else
|
||||
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
|
||||
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
|
||||
else
|
||||
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
|
||||
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
|
||||
fi
|
||||
fi
|
||||
|
||||
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
|
||||
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
|
||||
}
|
||||
|
||||
resolve_parent_commit() {
|
||||
if [[ -z "${PARENT_COMMIT:-}" ]]; then
|
||||
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
|
||||
if [[ -n "${PARENT_COMMIT}" ]]; then
|
||||
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
|
||||
export PARENT_COMMIT
|
||||
else
|
||||
echo "Could not determine parent commit (may be first commit in repo)"
|
||||
fi
|
||||
else
|
||||
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
|
||||
fi
|
||||
}
|
||||
|
||||
print_bake_config() {
|
||||
echo "--- :page_facing_up: Resolved bake configuration"
|
||||
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
|
||||
docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
|
||||
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
|
||||
echo "--- :arrow_down: Uploading bake config to Buildkite"
|
||||
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
|
||||
}
|
||||
|
||||
#################################
|
||||
# Main Script #
|
||||
#################################
|
||||
print_instance_info
|
||||
|
||||
if [[ $# -lt 7 ]]; then
|
||||
print_usage_and_exit
|
||||
fi
|
||||
|
||||
# input args
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
BRANCH=$4
|
||||
VLLM_USE_PRECOMPILED=$5
|
||||
VLLM_MERGE_BASE_COMMIT=$6
|
||||
CACHE_FROM=$7
|
||||
CACHE_TO=$8
|
||||
IMAGE_TAG=$7
|
||||
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
# build config
|
||||
TARGET="test-ci"
|
||||
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
|
||||
VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}"
|
||||
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
|
||||
CI_HCL_PATH="/tmp/ci.hcl"
|
||||
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
|
||||
|
||||
# docker buildx
|
||||
docker buildx create --name vllm-builder --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
docker buildx ls
|
||||
prepare_cache_tags
|
||||
ecr_login
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
# Environment info (for docs and human readers)
|
||||
# CI_HCL_URL - URL to ci.hcl (default: from ci-infra main branch)
|
||||
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
|
||||
# VLLM_BAKE_FILE - Path to vLLM's bake file (default: docker/docker-bake.hcl)
|
||||
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
|
||||
#
|
||||
# Build configuration (exported as environment variables for bake):
|
||||
export BUILDKITE_COMMIT
|
||||
export PARENT_COMMIT
|
||||
export IMAGE_TAG
|
||||
export IMAGE_TAG_LATEST
|
||||
export CACHE_FROM
|
||||
export CACHE_FROM_BASE_BRANCH
|
||||
export CACHE_FROM_MAIN
|
||||
export CACHE_TO
|
||||
export VLLM_USE_PRECOMPILED
|
||||
export VLLM_MERGE_BASE_COMMIT
|
||||
|
||||
# print args
|
||||
echo "--- :mag: Arguments"
|
||||
echo "REGISTRY: ${REGISTRY}"
|
||||
echo "REPO: ${REPO}"
|
||||
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
|
||||
echo "BRANCH: ${BRANCH}"
|
||||
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
|
||||
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
|
||||
echo "IMAGE_TAG: ${IMAGE_TAG}"
|
||||
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
|
||||
|
||||
# print build configuration
|
||||
echo "--- :mag: Build configuration"
|
||||
echo "TARGET: ${TARGET}"
|
||||
echo "CI HCL URL: ${CI_HCL_URL}"
|
||||
echo "vLLM bake file: ${VLLM_BAKE_FILE}"
|
||||
echo "BUILDER_NAME: ${BUILDER_NAME}"
|
||||
echo "CI_HCL_PATH: ${CI_HCL_PATH}"
|
||||
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
|
||||
|
||||
echo "--- :mag: Cache tags"
|
||||
echo "CACHE_TO: ${CACHE_TO}"
|
||||
echo "CACHE_FROM: ${CACHE_FROM}"
|
||||
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
|
||||
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
|
||||
|
||||
check_and_skip_if_image_exists
|
||||
|
||||
echo "--- :docker: Setting up Docker buildx bake"
|
||||
echo "Target: ${TARGET}"
|
||||
echo "CI HCL URL: ${CI_HCL_URL}"
|
||||
echo "vLLM bake file: ${VLLM_BAKE_FILE}"
|
||||
|
||||
if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then
|
||||
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}"
|
||||
echo "Make sure you're running from the vLLM repository root"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
||||
else
|
||||
merge_base_commit_build_args=""
|
||||
fi
|
||||
echo "--- :arrow_down: Downloading ci.hcl"
|
||||
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
|
||||
echo "Downloaded to ${CI_HCL_PATH}"
|
||||
|
||||
# build
|
||||
docker buildx build --file docker/Dockerfile \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
||||
${merge_base_commit_build_args} \
|
||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
||||
--push \
|
||||
--target test \
|
||||
--progress plain .
|
||||
setup_buildx_builder
|
||||
|
||||
# Compute parent commit for cache fallback (if not already set)
|
||||
resolve_parent_commit
|
||||
export PARENT_COMMIT
|
||||
|
||||
print_bake_config
|
||||
|
||||
echo "--- :docker: Building ${TARGET}"
|
||||
docker --debug buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
|
||||
|
||||
echo "--- :white_check_mark: Build complete"
|
||||
|
||||
@@ -4,7 +4,8 @@ steps:
|
||||
key: image-build
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
||||
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
|
||||
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG_LATEST; fi
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
|
||||
5
.buildkite/lm-eval-harness/configs/models-small-rocm.txt
Normal file
5
.buildkite/lm-eval-harness/configs/models-small-rocm.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
Qwen2.5-1.5B-Instruct.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
||||
@@ -1,219 +1,287 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# Build release images (12.9)
|
||||
- label: "Build release image (x86)"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image (arm64)"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
- label: "Create multi-arch manifest"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- input: "Provide Release version here"
|
||||
id: input-release-version
|
||||
fields:
|
||||
- text: "What is the release version?"
|
||||
key: release-version
|
||||
|
||||
- block: "Build CPU release image"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
- group: "Build Python wheels"
|
||||
key: "build-wheels"
|
||||
steps:
|
||||
- label: "Build wheel - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build and publish CPU release image"
|
||||
depends_on: block-cpu-release-image-build
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
- label: "Build wheel - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build arm64 CPU release image"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
- label: "Build wheel - aarch64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build and publish arm64 CPU release image"
|
||||
depends_on: block-arm64-cpu-release-image-build
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
- label: "Build wheel - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build ROCm release image"
|
||||
key: block-rocm-release-image-build
|
||||
depends_on: ~
|
||||
- label: "Build wheel - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build release image (ROCm)"
|
||||
depends_on: block-rocm-release-image-build
|
||||
id: build-release-image-rocm
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# Build base image first
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
||||
# Build vLLM ROCm image using the base
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
||||
- label: "Build wheel - x86_64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
|
||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest push vllm/vllm-openai:nightly"
|
||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
- group: "Build release Docker images"
|
||||
key: "build-release-images"
|
||||
steps:
|
||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
|
||||
- block: "Build release image for x86_64 CPU"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - CPU"
|
||||
depends_on:
|
||||
- block-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image for arm64 CPU"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - arm64 - CPU"
|
||||
depends_on:
|
||||
- block-arm64-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image for x86_64 ROCm"
|
||||
key: block-rocm-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - ROCm"
|
||||
depends_on: block-rocm-release-image-build
|
||||
id: build-release-image-rocm
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# Build base image first
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
||||
# Build vLLM ROCm image using the base
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
||||
|
||||
- group: "Publish release images"
|
||||
key: "publish-release-images"
|
||||
steps:
|
||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow - CUDA 12.9"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
||||
depends_on:
|
||||
- build-release-image-x86-cuda-13-0
|
||||
- build-release-image-arm64-cuda-13-0
|
||||
id: create-multi-arch-manifest-cuda-13-0
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest-cuda-13-0
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- group: "Publish wheels"
|
||||
key: "publish-wheels"
|
||||
steps:
|
||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||
key: block-upload-release-wheels
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- build-wheels
|
||||
|
||||
- label: "Upload release wheels to PyPI and GitHub"
|
||||
depends_on:
|
||||
- block-upload-release-wheels
|
||||
id: upload-release-wheels
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/upload-release-wheels.sh"
|
||||
|
||||
# =============================================================================
|
||||
# ROCm Release Pipeline (x86_64 only)
|
||||
|
||||
242
.buildkite/scripts/cherry-pick-from-milestone.sh
Executable file
242
.buildkite/scripts/cherry-pick-from-milestone.sh
Executable file
@@ -0,0 +1,242 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# cherry-pick-from-milestone.sh
|
||||
# Find commits from a GitHub milestone that are missing from the current branch
|
||||
# and output them in chronological order for cherry-picking.
|
||||
#
|
||||
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") <milestone> [options]
|
||||
|
||||
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
|
||||
|
||||
Arguments:
|
||||
milestone The GitHub milestone name (e.g., v0.14.0)
|
||||
|
||||
Options:
|
||||
--dry-run Show the cherry-pick commands without executing (default)
|
||||
--execute Actually execute the cherry-picks
|
||||
--main-branch Specify the main branch name (default: main)
|
||||
--help Show this help message
|
||||
|
||||
Examples:
|
||||
$(basename "$0") v0.14.0
|
||||
$(basename "$0") v0.14.0 --dry-run
|
||||
$(basename "$0") v0.14.0 --execute
|
||||
$(basename "$0") v0.14.0 --main-branch master
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Default values
|
||||
MILESTONE=""
|
||||
DRY_RUN=true
|
||||
MAIN_BRANCH="main"
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--main-branch)
|
||||
MAIN_BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
MILESTONE="$1"
|
||||
else
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate milestone argument
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
log_error "Milestone is required"
|
||||
usage
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if gh CLI is available
|
||||
if ! command -v gh &>/dev/null; then
|
||||
log_error "GitHub CLI (gh) is not installed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if authenticated with gh
|
||||
if ! gh auth status &>/dev/null; then
|
||||
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CURRENT_BRANCH=$(git branch --show-current)
|
||||
log_info "Current branch: ${CURRENT_BRANCH}"
|
||||
log_info "Main branch: ${MAIN_BRANCH}"
|
||||
log_info "Milestone: ${MILESTONE}"
|
||||
echo ""
|
||||
|
||||
# Fetch latest from remote
|
||||
log_info "Fetching latest from remote..."
|
||||
git fetch origin "$MAIN_BRANCH" --quiet
|
||||
|
||||
# Get merged PRs from the milestone, sorted by merge date
|
||||
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
||||
|
||||
# Store PR data in a temp file
|
||||
PR_DATA=$(mktemp)
|
||||
trap "rm -f $PR_DATA" EXIT
|
||||
|
||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||
--limit 1000 \
|
||||
--json number,title,mergeCommit,mergedAt \
|
||||
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
|
||||
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
|
||||
log_error "This could be due to:"
|
||||
log_error " - Milestone does not exist"
|
||||
log_error " - Network/authentication issues"
|
||||
log_error " - Invalid milestone name format"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -s "$PR_DATA" ]]; then
|
||||
log_warn "No merged PRs found for milestone '${MILESTONE}'"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
TOTAL_PRS=$(wc -l < "$PR_DATA")
|
||||
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
|
||||
echo ""
|
||||
|
||||
# Find commits that are missing from current branch
|
||||
MISSING_COMMITS=()
|
||||
MISSING_INFO=()
|
||||
|
||||
while IFS=$'\t' read -r sha pr_number title; do
|
||||
# Skip if SHA is empty or null
|
||||
if [[ -z "$sha" || "$sha" == "null" ]]; then
|
||||
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Check if this commit is already in the current branch
|
||||
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
|
||||
log_success "PR #${pr_number} already in branch: ${title:0:60}"
|
||||
else
|
||||
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
|
||||
MISSING_COMMITS+=("$sha")
|
||||
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
|
||||
fi
|
||||
done < "$PR_DATA"
|
||||
|
||||
echo ""
|
||||
|
||||
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
|
||||
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
|
||||
echo ""
|
||||
|
||||
# Output the cherry-pick commands
|
||||
echo "=========================================="
|
||||
echo "Cherry-pick commands (in chronological order):"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
for info in "${MISSING_INFO[@]}"; do
|
||||
echo "# $info"
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "# Run these commands to cherry-pick all missing commits:"
|
||||
echo "git cherry-pick ${MISSING_COMMITS[*]}"
|
||||
echo ""
|
||||
|
||||
# Or one by one
|
||||
echo "# Or cherry-pick one at a time:"
|
||||
for sha in "${MISSING_COMMITS[@]}"; do
|
||||
echo "git cherry-pick $sha"
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Execute if requested
|
||||
if [[ "$DRY_RUN" == false ]]; then
|
||||
echo "=========================================="
|
||||
log_info "Executing cherry-picks..."
|
||||
echo "=========================================="
|
||||
|
||||
for i in "${!MISSING_COMMITS[@]}"; do
|
||||
sha="${MISSING_COMMITS[$i]}"
|
||||
info="${MISSING_INFO[$i]}"
|
||||
|
||||
echo ""
|
||||
log_info "Cherry-picking: $info"
|
||||
|
||||
if git cherry-pick "$sha"; then
|
||||
log_success "Successfully cherry-picked $sha"
|
||||
else
|
||||
log_error "Failed to cherry-pick $sha"
|
||||
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
log_success "All cherry-picks completed successfully!"
|
||||
else
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
|
||||
echo "Run with --execute to perform the cherry-picks"
|
||||
echo "=========================================="
|
||||
fi
|
||||
@@ -3,7 +3,14 @@
|
||||
set -ex
|
||||
|
||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
||||
# This script uses DockerHub API to list and delete old tags with specified prefix
|
||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
|
||||
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
|
||||
|
||||
# Get tag prefix from argument, default to "nightly-" if not provided
|
||||
TAG_PREFIX="${1:-nightly-}"
|
||||
|
||||
echo "Cleaning up tags with prefix: $TAG_PREFIX"
|
||||
|
||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||
@@ -45,7 +52,7 @@ get_all_tags() {
|
||||
set -x
|
||||
|
||||
# Get both last_updated timestamp and tag name, separated by |
|
||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
|
||||
|
||||
if [ -z "$tags" ]; then
|
||||
break
|
||||
|
||||
36
.buildkite/scripts/push-nightly-builds.sh
Executable file
36
.buildkite/scripts/push-nightly-builds.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
|
||||
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
|
||||
# otherwise they will be cleaned up together with the main "nightly" tags.
|
||||
|
||||
TAG_VARIANT="$1"
|
||||
if [ -n "$TAG_VARIANT" ]; then
|
||||
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
|
||||
TAG_NAME="$TAG_VARIANT-nightly"
|
||||
else
|
||||
ORIG_TAG_SUFFIX=""
|
||||
TAG_NAME="nightly"
|
||||
fi
|
||||
|
||||
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
|
||||
|
||||
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
|
||||
|
||||
# pull original arch-dependent images from AWS ECR Public
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
|
||||
# tag arch-dependent images
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
# push arch-dependent images to DockerHub
|
||||
docker push vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker push vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
# push arch-independent manifest to DockerHub
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
|
||||
@@ -18,15 +18,18 @@ wait_for_server() {
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
# Set BACKENDS and platform-specific args based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
PLATFORM_ARGS=("--no-async-scheduling")
|
||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
PLATFORM_ARGS=()
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
@@ -54,6 +57,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
"${PLATFORM_ARGS[@]}" \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
227
.buildkite/scripts/trigger-ci-build.sh
Executable file
227
.buildkite/scripts/trigger-ci-build.sh
Executable file
@@ -0,0 +1,227 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# trigger-ci-build.sh
|
||||
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
|
||||
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
#
|
||||
# Usage: ./trigger-ci-build.sh [options]
|
||||
#
|
||||
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
|
||||
#
|
||||
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default configuration
|
||||
PIPELINE="ci"
|
||||
DRY_RUN=true
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") [options]
|
||||
|
||||
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
|
||||
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
|
||||
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
|
||||
Options:
|
||||
--execute Actually trigger the build (default: dry-run)
|
||||
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
|
||||
--commit Override commit SHA (default: current HEAD)
|
||||
--branch Override branch name (default: current branch)
|
||||
--message Custom build message (default: auto-generated)
|
||||
--help Show this help message
|
||||
|
||||
Prerequisites:
|
||||
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
|
||||
- bk configured: bk configure
|
||||
|
||||
Examples:
|
||||
$(basename "$0") # Dry-run, show what would happen
|
||||
$(basename "$0") --execute # Actually trigger the build
|
||||
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Parse arguments
|
||||
COMMIT=""
|
||||
BRANCH=""
|
||||
MESSAGE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--pipeline)
|
||||
PIPELINE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--commit)
|
||||
COMMIT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--branch)
|
||||
BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--message)
|
||||
MESSAGE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Check if bk CLI is installed
|
||||
if ! command -v bk &>/dev/null; then
|
||||
log_error "Buildkite CLI (bk) is not installed"
|
||||
echo ""
|
||||
echo "Install with:"
|
||||
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
|
||||
echo ""
|
||||
echo "Then configure:"
|
||||
echo " bk configure"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get current commit and branch if not overridden
|
||||
if [[ -z "$COMMIT" ]]; then
|
||||
COMMIT=$(git rev-parse HEAD)
|
||||
fi
|
||||
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
BRANCH=$(git branch --show-current)
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
# Detached HEAD state - try to get branch from ref
|
||||
BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
fi
|
||||
fi
|
||||
|
||||
# Generate default message if not provided
|
||||
if [[ -z "$MESSAGE" ]]; then
|
||||
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
|
||||
MESSAGE="[Manual] ${COMMIT_MSG}"
|
||||
fi
|
||||
|
||||
# Safety check: Verify the commit exists on the remote
|
||||
log_info "Verifying commit exists on remote..."
|
||||
git fetch origin --quiet 2>/dev/null || true
|
||||
|
||||
# Check if commit is reachable from any remote branch
|
||||
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
|
||||
if [[ -z "$REMOTE_BRANCHES" ]]; then
|
||||
log_error "Commit ${COMMIT} does not exist on any remote branch!"
|
||||
echo ""
|
||||
echo "The CI system will fail to checkout this commit."
|
||||
echo "Please push your changes first:"
|
||||
echo ""
|
||||
echo " git push origin ${BRANCH}"
|
||||
echo ""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "Commit found on remote branches:"
|
||||
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
|
||||
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
|
||||
echo " ... and more"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
log_info "Pipeline: ${PIPELINE}"
|
||||
log_info "Branch: ${BRANCH}"
|
||||
log_info "Commit: ${COMMIT}"
|
||||
log_info "Message: ${MESSAGE}"
|
||||
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
|
||||
echo ""
|
||||
|
||||
# Build the command
|
||||
CMD=(bk build create
|
||||
-y
|
||||
-w
|
||||
-i
|
||||
--pipeline "${PIPELINE}"
|
||||
--commit "${COMMIT}"
|
||||
--branch "${BRANCH}"
|
||||
--message "${MESSAGE}"
|
||||
--env "RUN_ALL=1"
|
||||
--env "NIGHTLY=1"
|
||||
)
|
||||
|
||||
if [[ "$DRY_RUN" == true ]]; then
|
||||
echo "=========================================="
|
||||
log_warn "DRY-RUN MODE - No build will be triggered"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "Command that would be executed:"
|
||||
echo ""
|
||||
# Escape single quotes in values for safe shell display
|
||||
escape_for_shell() {
|
||||
printf '%s' "$1" | sed "s/'/'\\\\''/g"
|
||||
}
|
||||
echo " bk build create \\"
|
||||
echo " -y \\"
|
||||
echo " -w \\"
|
||||
echo " -i \\"
|
||||
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
|
||||
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
|
||||
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
|
||||
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
|
||||
echo " --env 'RUN_ALL=1' \\"
|
||||
echo " --env 'NIGHTLY=1'"
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
|
||||
echo ""
|
||||
echo " $0 --execute"
|
||||
echo "=========================================="
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Triggering build..."
|
||||
|
||||
# Execute the command - bk will print the URL and open browser
|
||||
"${CMD[@]}"
|
||||
104
.buildkite/scripts/upload-release-wheels.sh
Normal file
104
.buildkite/scripts/upload-release-wheels.sh
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
||||
if [ -z "$GIT_VERSION" ]; then
|
||||
echo "[FATAL] Not on a git tag, cannot create release."
|
||||
exit 1
|
||||
else
|
||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||
fi
|
||||
# sanity check for version mismatch
|
||||
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
|
||||
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
|
||||
echo "[WARNING] Force release and ignore version mismatch"
|
||||
else
|
||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||
|
||||
# check pypi token
|
||||
if [ -z "$PYPI_TOKEN" ]; then
|
||||
echo "[FATAL] PYPI_TOKEN is not set."
|
||||
exit 1
|
||||
else
|
||||
export TWINE_USERNAME="__token__"
|
||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||
fi
|
||||
|
||||
# check github token
|
||||
if [ -z "$GITHUB_TOKEN" ]; then
|
||||
echo "[FATAL] GITHUB_TOKEN is not set."
|
||||
exit 1
|
||||
else
|
||||
export GH_TOKEN="$GITHUB_TOKEN"
|
||||
fi
|
||||
|
||||
set -x # avoid printing secrets above
|
||||
|
||||
# download gh CLI from github
|
||||
# Get latest gh CLI version from GitHub API
|
||||
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
|
||||
if [ -z "$GH_VERSION" ]; then
|
||||
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
|
||||
exit 1
|
||||
fi
|
||||
echo "Downloading gh CLI version: $GH_VERSION"
|
||||
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
|
||||
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
|
||||
GH_INSTALL_DIR="/tmp/gh-install"
|
||||
mkdir -p "$GH_INSTALL_DIR"
|
||||
pushd "$GH_INSTALL_DIR"
|
||||
curl -L -o "$GH_TARBALL" "$GH_URL"
|
||||
tar -xzf "$GH_TARBALL"
|
||||
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
|
||||
if [ -z "$GH_BIN" ]; then
|
||||
echo "[FATAL] Failed to find gh CLI executable"
|
||||
exit 1
|
||||
fi
|
||||
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
|
||||
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
|
||||
command "$GH_BIN" release list --limit 5
|
||||
popd
|
||||
|
||||
# install twine from pypi
|
||||
python3 -m venv /tmp/vllm-release-env
|
||||
source /tmp/vllm-release-env/bin/activate
|
||||
pip install twine
|
||||
python3 -m twine --version
|
||||
|
||||
# copy release wheels to local directory
|
||||
DIST_DIR=/tmp/vllm-release-dist
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
echo "Copying wheels to local directory"
|
||||
mkdir -p $DIST_DIR
|
||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||
echo "Wheels copied to local directory"
|
||||
# generate source tarball
|
||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
||||
ls -la $DIST_DIR
|
||||
|
||||
|
||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||
if [ -z "$PYPI_WHEEL_FILES" ]; then
|
||||
echo "No default variant wheels found, quitting..."
|
||||
exit 1
|
||||
fi
|
||||
python3 -m twine check $PYPI_WHEEL_FILES
|
||||
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
|
||||
echo "Wheels uploaded to PyPI"
|
||||
|
||||
# create release on GitHub with the release version and all wheels
|
||||
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
|
||||
@@ -71,6 +71,7 @@ steps:
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -82,6 +83,7 @@ steps:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -428,6 +430,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -452,10 +456,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
@@ -703,6 +709,17 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -855,7 +872,7 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
@@ -1114,7 +1131,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -1451,7 +1468,7 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
@@ -1462,10 +1479,10 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 15
|
||||
@@ -1476,7 +1493,7 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
@@ -1662,17 +1679,6 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
|
||||
@@ -64,6 +64,7 @@ steps:
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -75,6 +76,7 @@ steps:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -374,6 +376,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -396,10 +400,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
source_file_dependencies:
|
||||
@@ -624,6 +630,56 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -951,7 +1007,7 @@ steps:
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 21 min
|
||||
- label: Blackwell Test # 23 min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@@ -961,7 +1017,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -991,6 +1047,8 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -1045,6 +1103,48 @@ steps:
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Hopper Fusion E2E Tests (H100) # 10min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# skip Llama-4 since it does not fit on this device
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
|
||||
|
||||
- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1216,7 +1316,7 @@ steps:
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
|
||||
|
||||
- label: Pipeline + Context Parallelism Test # 45min
|
||||
timeout_in_minutes: 60
|
||||
@@ -1344,22 +1444,31 @@ steps:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
gpu: h200
|
||||
- label: Sequence Parallel Tests (H100) # 60 min
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run sequence parallel tests
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
|
||||
- label: Distributed Tests (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### H200 test #####
|
||||
|
||||
- label: LM Eval Large Models (H200) # optional
|
||||
timeout_in_minutes: 60
|
||||
gpu: h200
|
||||
|
||||
@@ -4,8 +4,10 @@ depends_on:
|
||||
steps:
|
||||
- label: V1 attention (H100)
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -13,9 +15,11 @@ steps:
|
||||
|
||||
- label: V1 attention (B200)
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Fusion and Compile Tests (B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
@@ -26,7 +26,7 @@ steps:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
# this runner has 2 GPUs available even though num_devices=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
@@ -37,9 +37,9 @@ steps:
|
||||
- label: Fusion E2E (2 GPUs)(B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Distributed Comm Ops
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
@@ -18,7 +18,7 @@ steps:
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 90
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
@@ -54,7 +54,7 @@ steps:
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_utils
|
||||
@@ -103,8 +103,8 @@ steps:
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
device: h100
|
||||
num_devices: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
@@ -120,9 +120,9 @@ steps:
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: Distributed Tests (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
device: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
@@ -133,26 +133,34 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H200)
|
||||
gpu: h200
|
||||
- label: Sequence Parallel Tests (H100)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run sequence parallel tests
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H100)
|
||||
device: h100
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
@@ -161,8 +169,9 @@ steps:
|
||||
- label: 2 Node Test (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
num_nodes: 2
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -176,7 +185,7 @@ steps:
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
@@ -184,10 +193,21 @@ steps:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs))
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -196,4 +216,46 @@ steps:
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- label: Hopper Fusion E2E Tests (H100)
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# skip Llama-4 since it does not fit on this device
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
|
||||
|
||||
- label: Hopper Fusion Distributed E2E Tests (2xH100)
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
|
||||
@@ -4,27 +4,27 @@ depends_on:
|
||||
steps:
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
@@ -33,10 +33,11 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
@@ -23,4 +23,8 @@ steps:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
# Run this test standalone for now;
|
||||
# need to untangle use (implicit) use of spawn/fork across the tests.
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
# Run the rest of v1/engine tests
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
|
||||
@@ -14,7 +14,7 @@ steps:
|
||||
- label: EPLB Execution
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
|
||||
@@ -57,8 +57,8 @@ steps:
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
@@ -77,7 +77,7 @@ steps:
|
||||
- label: Kernels (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
@@ -85,7 +85,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -114,4 +114,55 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
@@ -12,9 +12,9 @@ steps:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
device: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -24,9 +24,9 @@ steps:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -37,10 +37,39 @@ steps:
|
||||
|
||||
- label: LM Eval Small Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
- label: LM Eval Large Models (H200)
|
||||
timeout_in_minutes: 60
|
||||
device: h200
|
||||
optional: true
|
||||
num_devices: 8
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
||||
|
||||
@@ -14,7 +14,7 @@ steps:
|
||||
|
||||
- label: LoRA TP (Distributed)
|
||||
timeout_in_minutes: 30
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
|
||||
@@ -31,7 +31,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
@@ -82,7 +82,7 @@ steps:
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/tracing
|
||||
@@ -121,17 +121,19 @@ steps:
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -140,7 +142,7 @@ steps:
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@@ -153,7 +155,7 @@ steps:
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
|
||||
@@ -44,7 +44,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/models/test_utils.py
|
||||
- tests/models/test_vision.py
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Distributed Model Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||
- vllm/model_executor/models/
|
||||
|
||||
@@ -18,7 +18,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Plugin Tests (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
|
||||
@@ -16,14 +16,14 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Weight Loading Multiple GPU # 33min
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -15,8 +15,8 @@ steps:
|
||||
|
||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
num_devices: 2
|
||||
device: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
|
||||
12
.github/mergify.yml
vendored
12
.github/mergify.yml
vendored
@@ -414,6 +414,18 @@ pull_request_rules:
|
||||
remove:
|
||||
- needs-rebase
|
||||
|
||||
- name: label-bug
|
||||
description: Automatically apply bug label
|
||||
conditions:
|
||||
- label != stale
|
||||
- or:
|
||||
- title~=(?i)\bbug\b
|
||||
- title~=(?i)\bbugfix\b
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- bug
|
||||
|
||||
- name: label-kv-connector
|
||||
description: Automatically apply kv-connector label
|
||||
conditions:
|
||||
|
||||
3
.github/workflows/macos-smoke-test.yml
vendored
3
.github/workflows/macos-smoke-test.yml
vendored
@@ -29,8 +29,9 @@ jobs:
|
||||
|
||||
- name: Install dependencies and build vLLM
|
||||
run: |
|
||||
uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
|
||||
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
|
||||
uv pip install -e .
|
||||
uv pip install -e . --no-build-isolation
|
||||
env:
|
||||
CMAKE_BUILD_PARALLEL_LEVEL: 4
|
||||
|
||||
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -7,6 +7,9 @@ vllm/vllm_flash_attn/*
|
||||
# OpenAI triton kernels copied from source
|
||||
vllm/third_party/triton_kernels/*
|
||||
|
||||
# FlashMLA interface copied from source
|
||||
vllm/third_party/flashmla/flash_mla_interface.py
|
||||
|
||||
# triton jit
|
||||
.triton
|
||||
|
||||
@@ -191,6 +194,9 @@ CLAUDE.md
|
||||
AGENTS.md
|
||||
.codex/
|
||||
|
||||
# Cursor
|
||||
.cursor/
|
||||
|
||||
# DS Store
|
||||
.DS_Store
|
||||
|
||||
|
||||
@@ -147,6 +147,13 @@ repos:
|
||||
entry: python tools/pre_commit/validate_config.py
|
||||
language: python
|
||||
additional_dependencies: [regex]
|
||||
- id: validate-docker-versions
|
||||
name: Validate docker/versions.json matches Dockerfile
|
||||
entry: python tools/generate_versions_json.py --check
|
||||
language: python
|
||||
files: ^docker/(Dockerfile|versions\.json)$
|
||||
pass_filenames: false
|
||||
additional_dependencies: [dockerfile-parse]
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@@ -377,7 +377,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/marlin/generate_kernels.py)
|
||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
||||
@@ -412,7 +412,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_float16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
@@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_bfloat16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||
@@ -434,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
|
||||
@@ -446,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
|
||||
@@ -459,10 +459,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||
"csrc/quantization/marlin/marlin.cu"
|
||||
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/marlin/awq_marlin_repack.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_SRCS}"
|
||||
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
|
||||
|
||||
@@ -20,8 +20,12 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
||||
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"vllm": dict(backend="vllm", enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", enabled=True),
|
||||
"vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
|
||||
"vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
|
||||
"flashinfer-swizzle": dict(
|
||||
backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
|
||||
),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
@@ -36,7 +40,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
@@ -63,19 +67,36 @@ def benchmark(batch_size, provider, N, K):
|
||||
|
||||
if cfg["backend"] == "vllm":
|
||||
# vLLM's FP4 quantization
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(a, a_global_scale),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
elif cfg["backend"] == "flashinfer":
|
||||
# FlashInfer's FP4 quantization
|
||||
# Use is_sf_swizzled_layout=True to match vLLM's output format
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
# Convert ms to us for better readability at small batch sizes
|
||||
to_us = lambda t_ms: t_ms * 1000
|
||||
@@ -92,7 +113,9 @@ def prepare_shapes(args):
|
||||
return out
|
||||
|
||||
|
||||
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
def _test_accuracy_once(
|
||||
M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
|
||||
):
|
||||
"""Test accuracy between vLLM and FlashInfer FP4 quantization."""
|
||||
# Create input tensor
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
@@ -101,11 +124,13 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
a_global_scale = compute_global_scale(a)
|
||||
|
||||
# vLLM quantization
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
|
||||
# FlashInfer quantization (with swizzled layout to match vLLM's output)
|
||||
flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
|
||||
|
||||
@@ -114,7 +139,14 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
vllm_fp4,
|
||||
flashinfer_fp4,
|
||||
)
|
||||
print(f"M={M}, K={K}, dtype={dtype}: PASSED")
|
||||
# Compare scales
|
||||
torch.testing.assert_close(
|
||||
vllm_scale,
|
||||
flashinfer_scale,
|
||||
)
|
||||
print(
|
||||
f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED" # noqa: E501
|
||||
)
|
||||
|
||||
|
||||
def test_accuracy():
|
||||
@@ -130,9 +162,10 @@ def test_accuracy():
|
||||
Ms = [1, 1024]
|
||||
Ks = [4096]
|
||||
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device)
|
||||
for is_sf_swizzled_layout in [True, False]:
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)
|
||||
|
||||
print("\nAll accuracy tests passed!")
|
||||
|
||||
@@ -145,7 +178,7 @@ if __name__ == "__main__":
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
|
||||
@@ -7,7 +7,7 @@ import itertools
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.activation # noqa F401
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.custom_op import op_registry
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
||||
@@ -33,14 +33,14 @@ def benchmark_activation(
|
||||
torch.set_default_device(device)
|
||||
|
||||
if func_name == "gelu_and_mul":
|
||||
layer = CustomOp.op_registry[func_name](approximate="none")
|
||||
layer = op_registry[func_name](approximate="none")
|
||||
elif func_name == "gelu_and_mul_tanh":
|
||||
layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
|
||||
layer = op_registry["gelu_and_mul"](approximate="tanh")
|
||||
elif func_name == "fatrelu_and_mul":
|
||||
threshold = 0.5
|
||||
layer = CustomOp.op_registry[func_name](threshold)
|
||||
layer = op_registry[func_name](threshold)
|
||||
else:
|
||||
layer = CustomOp.op_registry[func_name]()
|
||||
layer = op_registry[func_name]()
|
||||
|
||||
x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
|
||||
compiled_layer = torch.compile(layer.forward_native)
|
||||
|
||||
@@ -9,6 +9,7 @@ but use different quantization strategies and backends.
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
||||
@@ -138,12 +139,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
e=num_experts,
|
||||
n=n,
|
||||
k=k,
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=num_experts,
|
||||
hidden_dim=k,
|
||||
intermediate_size_per_partition=n,
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
@@ -196,10 +197,9 @@ def bench_run(
|
||||
)
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -242,10 +242,9 @@ def bench_run(
|
||||
)
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
99
benchmarks/kernels/benchmark_fused_topk.py
Normal file
99
benchmarks/kernels/benchmark_fused_topk.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.router.fused_topk_router import fused_topk
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
num_tokens_range = [2**i for i in range(0, 8, 2)]
|
||||
num_experts_range = [16, 32, 64, 128, 256, 512]
|
||||
topk_range = [3, 4]
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
|
||||
|
||||
def torch_topk(
|
||||
gating_output: torch.Tensor,
|
||||
topk: int,
|
||||
renormalize: bool,
|
||||
scoring_func: str = "softmax",
|
||||
):
|
||||
if scoring_func == "softmax":
|
||||
scores = torch.softmax(gating_output.float(), dim=-1)
|
||||
else:
|
||||
scores = torch.sigmoid(gating_output.float())
|
||||
topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
|
||||
|
||||
if renormalize:
|
||||
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
|
||||
|
||||
return topk_weights, topk_ids
|
||||
|
||||
|
||||
def get_benchmark(scoring_func):
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_vals=[list(_) for _ in configs],
|
||||
line_arg="provider",
|
||||
line_vals=["torch", "vllm"],
|
||||
line_names=["Torch", "vLLM"],
|
||||
styles=[("blue", "-"), ("red", "-")],
|
||||
ylabel="us",
|
||||
plot_name=f"fused-topk-perf-{scoring_func}",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
dtype = torch.bfloat16
|
||||
hidden_size = 1024
|
||||
renormalize = True
|
||||
hidden_states = torch.randn(
|
||||
(num_tokens, hidden_size), dtype=dtype, device="cuda"
|
||||
)
|
||||
gating_output = torch.randn(
|
||||
(num_tokens, num_experts), dtype=dtype, device="cuda"
|
||||
)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: torch_topk(
|
||||
gating_output=gating_output,
|
||||
topk=topk,
|
||||
renormalize=renormalize,
|
||||
scoring_func=scoring_func,
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: fused_topk(
|
||||
hidden_states=hidden_states,
|
||||
gating_output=gating_output,
|
||||
topk=topk,
|
||||
renormalize=renormalize,
|
||||
scoring_func=scoring_func,
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
return benchmark
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the MoE topk kernel.")
|
||||
parser.add_argument("--scoring-func", type=str, default="softmax")
|
||||
parser.add_argument("--save-path", type=str, default="./configs/fused_topk/")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get the benchmark function
|
||||
benchmark = get_benchmark(args.scoring_func)
|
||||
# Run performance benchmark
|
||||
benchmark.run(print_data=True, save_path=args.save_path)
|
||||
@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
@@ -134,13 +135,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -166,13 +167,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -231,7 +231,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
|
||||
assert bt.w_tok_s is None
|
||||
assert bt.group_size is not None
|
||||
|
||||
fn = lambda: ops.gptq_marlin_gemm(
|
||||
fn = lambda: ops.marlin_gemm(
|
||||
a=bt.a,
|
||||
c=None,
|
||||
b_q_weight=w_q,
|
||||
|
||||
@@ -239,7 +239,7 @@ def bench_run(
|
||||
"sm_version": sm_version,
|
||||
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
|
||||
# Kernels
|
||||
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||
"marlin_gemm": ops.marlin_gemm,
|
||||
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||
"gptq_marlin_repack": ops.gptq_marlin_repack,
|
||||
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
|
||||
@@ -263,21 +263,21 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm",
|
||||
description="marlin_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm_fp32",
|
||||
description="marlin_gemm_fp32",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
|
||||
@@ -15,11 +15,18 @@ import ray
|
||||
import torch
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
FusedMoEQuantConfig,
|
||||
RoutingMethodType,
|
||||
_get_config_dtype_str,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
||||
TritonOrDeepGemmExperts,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.triton_utils import triton
|
||||
@@ -194,10 +201,36 @@ def benchmark_config(
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
|
||||
deep_gemm_experts = None
|
||||
if use_deep_gemm:
|
||||
deep_gemm_experts = mk.FusedMoEModularKernel(
|
||||
prepare_finalize=MoEPrepareAndFinalizeNoEP(),
|
||||
fused_experts=TritonOrDeepGemmExperts(
|
||||
moe_config=FusedMoEConfig(
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
hidden_dim=hidden_size,
|
||||
intermediate_size_per_partition=shard_intermediate_size,
|
||||
num_local_experts=num_experts,
|
||||
activation="silu",
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=init_dtype,
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
device="cuda",
|
||||
),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
with override_config(config):
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
x, input_gating, topk, renormalize=not use_deep_gemm
|
||||
)
|
||||
|
||||
if use_deep_gemm:
|
||||
return deep_gemm_experts(
|
||||
x, w1, w2, topk_weights, topk_ids, inplace=True
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
w1,
|
||||
@@ -206,7 +239,6 @@ def benchmark_config(
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
quant_config=quant_config,
|
||||
allow_deep_gemm=use_deep_gemm,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
@@ -643,6 +675,7 @@ def main(args: argparse.Namespace):
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
"Glm4MoeLiteForCausalLM",
|
||||
"NemotronHForCausalLM",
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
|
||||
@@ -8,10 +8,8 @@ import ray
|
||||
import torch
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||
_moe_permute,
|
||||
_moe_unpermute_and_reduce,
|
||||
moe_permute,
|
||||
moe_unpermute,
|
||||
)
|
||||
@@ -41,7 +39,6 @@ def benchmark_permute(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
use_customized_permute: bool = False,
|
||||
) -> float:
|
||||
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
@@ -64,31 +61,14 @@ def benchmark_permute(
|
||||
input_gating.copy_(gating_output[i])
|
||||
|
||||
def run():
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
)
|
||||
moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
@@ -133,11 +113,9 @@ def benchmark_unpermute(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
use_customized_permute: bool = False,
|
||||
) -> float:
|
||||
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
output_hidden_states = torch.empty_like(hidden_states)
|
||||
if use_fp8_w8a8:
|
||||
align_block_size = 128 # deepgemm needs 128 m aligned block
|
||||
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
|
||||
@@ -152,78 +130,37 @@ def benchmark_unpermute(
|
||||
)
|
||||
|
||||
def prepare():
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_hidden_states.to(dtype),
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_qhidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_qhidden_states.to(dtype),
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
)
|
||||
(
|
||||
permuted_hidden_states,
|
||||
_,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
_,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_hidden_states.to(dtype),
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
)
|
||||
|
||||
def run(input: tuple):
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = input
|
||||
output = torch.empty_like(hidden_states)
|
||||
moe_unpermute(
|
||||
output,
|
||||
permuted_hidden_states,
|
||||
topk_weights,
|
||||
inv_perm_idx,
|
||||
first_token_off,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = input
|
||||
_moe_unpermute_and_reduce(
|
||||
output_hidden_states,
|
||||
permuted_hidden_states,
|
||||
inv_perm,
|
||||
topk_weights,
|
||||
True,
|
||||
)
|
||||
(permuted_hidden_states, first_token_off, inv_perm_idx) = input
|
||||
output = torch.empty_like(hidden_states)
|
||||
moe_unpermute(
|
||||
output,
|
||||
permuted_hidden_states,
|
||||
topk_weights,
|
||||
inv_perm_idx,
|
||||
first_token_off,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
input = prepare()
|
||||
@@ -278,8 +215,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
use_customized_permute: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
) -> tuple[float, float]:
|
||||
set_random_seed(self.seed)
|
||||
|
||||
permute_time = benchmark_permute(
|
||||
@@ -291,7 +227,6 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
use_customized_permute=use_customized_permute,
|
||||
)
|
||||
unpermute_time = benchmark_unpermute(
|
||||
num_tokens,
|
||||
@@ -302,7 +237,6 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
use_customized_permute=use_customized_permute,
|
||||
)
|
||||
return permute_time, unpermute_time
|
||||
|
||||
@@ -330,6 +264,7 @@ def main(args: argparse.Namespace):
|
||||
config.architectures[0] == "DeepseekV3ForCausalLM"
|
||||
or config.architectures[0] == "DeepseekV2ForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeLiteForCausalLM"
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
@@ -348,7 +283,6 @@ def main(args: argparse.Namespace):
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_customized_permute = args.use_customized_permute
|
||||
|
||||
if args.batch_size is None:
|
||||
batch_sizes = [
|
||||
@@ -400,7 +334,6 @@ def main(args: argparse.Namespace):
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_customized_permute,
|
||||
)
|
||||
for batch_size in batch_sizes
|
||||
],
|
||||
@@ -420,7 +353,6 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
|
||||
)
|
||||
parser.add_argument("--use-customized-permute", action="store_true")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--trust-remote-code", action="store_true")
|
||||
|
||||
@@ -14,7 +14,6 @@ from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import (
|
||||
calc_diff,
|
||||
fp8_gemm_nt,
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_block_cast_to_fp8,
|
||||
)
|
||||
|
||||
@@ -48,8 +47,9 @@ def benchmark_shape(
|
||||
block_size = [128, 128]
|
||||
|
||||
# Pre-quantize A for all implementations
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
|
||||
A, block_size[1], column_major_scales=True, tma_aligned_scales=True
|
||||
)
|
||||
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
|
||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||
|
||||
@@ -13,6 +13,8 @@ endif()
|
||||
#
|
||||
# Define environment variables for special configurations
|
||||
#
|
||||
set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
|
||||
set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
|
||||
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
|
||||
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
|
||||
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
|
||||
@@ -103,6 +105,16 @@ else()
|
||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
||||
|
||||
# Support cross-compilation by allowing override via environment variables
|
||||
if (ENABLE_AVX2)
|
||||
set(AVX2_FOUND ON)
|
||||
message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
|
||||
endif()
|
||||
if (ENABLE_AVX512)
|
||||
set(AVX512_FOUND ON)
|
||||
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
@@ -379,6 +391,12 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
endif()
|
||||
|
||||
if(USE_ONEDNN)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/dnnl_kernels.cpp"
|
||||
|
||||
@@ -19,7 +19,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
flashmla
|
||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
|
||||
GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
|
||||
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
|
||||
GIT_PROGRESS TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
@@ -30,6 +30,24 @@ endif()
|
||||
FetchContent_MakeAvailable(flashmla)
|
||||
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
|
||||
# Vendor FlashMLA interface into vLLM with torch-ops shim.
|
||||
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
|
||||
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
|
||||
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
|
||||
FLASHMLA_INTERFACE_CONTENT)
|
||||
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
|
||||
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
|
||||
FLASHMLA_INTERFACE_CONTENT
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
|
||||
# Install the generated flash_mla_interface.py to the wheel
|
||||
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
|
||||
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
DESTINATION vllm/third_party/flashmla/
|
||||
COMPONENT _flashmla_C)
|
||||
|
||||
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||
# Only build FlashMLA kernels if we are building for something compatible with
|
||||
# sm90a
|
||||
@@ -55,16 +73,42 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_SOURCES
|
||||
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
|
||||
|
||||
# Misc kernels for decoding
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu
|
||||
|
||||
# sm90 dense decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
|
||||
|
||||
# sm90 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
|
||||
|
||||
# sm90 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
|
||||
|
||||
# sm100 dense prefill & backward
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
|
||||
|
||||
# sm100 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
|
||||
|
||||
# sm100 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
|
||||
)
|
||||
|
||||
set(FlashMLA_Extension_SOURCES
|
||||
@@ -76,6 +120,7 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/kerutils/include
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -83,7 +128,6 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_Extension_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -110,9 +154,12 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
||||
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
|
||||
target_compile_options(_flashmla_C PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
|
||||
|
||||
define_extension_target(
|
||||
_flashmla_extension_C
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <vector>
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
|
||||
@@ -24,7 +24,14 @@
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
#endif
|
||||
|
||||
#if defined(__gfx942__)
|
||||
constexpr float kFp8ScaleDivisor = 224.f;
|
||||
#else
|
||||
constexpr float kFp8ScaleDivisor = 448.f;
|
||||
#endif
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping) {
|
||||
torch::Device src_device = src.device();
|
||||
torch::Device dst_device = dst.device();
|
||||
@@ -49,10 +56,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
char* src_ptr = static_cast<char*>(src.data_ptr());
|
||||
char* dst_ptr = static_cast<char*>(dst.data_ptr());
|
||||
|
||||
// We use the stride instead of numel in case the cache is padded for memory
|
||||
// alignment reasons, we assume the blocks data (inclusive of any padding)
|
||||
// is contiguous in memory
|
||||
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(
|
||||
src_device.is_cuda() ? src_device : dst_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
@@ -205,7 +208,8 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
const int64_t block_stride, const int64_t page_stride,
|
||||
const int64_t head_stride, const int64_t key_stride,
|
||||
const int64_t value_stride, const int num_heads, const int head_size,
|
||||
const int block_size, const float* k_scale, const float* v_scale) {
|
||||
const int block_size, const float* k_scale, const float* v_scale,
|
||||
const int kv_scale_stride) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
@@ -229,21 +233,23 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
// this is true for the NHD layout where `head_stride == head_size`
|
||||
const bool is_contiguous_heads = (head_stride == head_size);
|
||||
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
if (is_contiguous_heads) {
|
||||
// NHD layout
|
||||
|
||||
if (is_contiguous_heads && kv_scale_stride == 0) {
|
||||
// NHD layout and k/v_scales are [1] (i.e. single scale for all heads)
|
||||
// kv cache: [num_blocks, block_size, num_heads, head_size]
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
|
||||
blockDim.x, k_op);
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
|
||||
threadIdx.x, blockDim.x, v_op);
|
||||
|
||||
} else {
|
||||
// HND layout OR k/v_scales are [num_heads] (i.e. per-attn-head)
|
||||
// HND layout: heads are strided, but each head_size segment is contiguous
|
||||
// kv cache: [num_blocks, num_heads, block_size, head_size]
|
||||
const int lane = threadIdx.x & 31; // 0..31 within warp
|
||||
@@ -259,6 +265,16 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
cache_t* __restrict__ v_dst_h =
|
||||
value_dst + static_cast<int64_t>(head) * head_stride;
|
||||
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
|
||||
? 0.f
|
||||
: k_scale[head * kv_scale_stride];
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
|
||||
? 0.f
|
||||
: v_scale[head * kv_scale_stride];
|
||||
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
|
||||
// within each head, let the 32 threads of the warp perform the vector
|
||||
// copy
|
||||
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
|
||||
@@ -391,8 +407,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
|
||||
}
|
||||
|
||||
// Compute the scale for the tile
|
||||
float tile_scale = max_abs / 448.f;
|
||||
tile_scale = fmaxf(tile_scale, FLT_MIN);
|
||||
float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);
|
||||
|
||||
// The first lane of each half-warp writes the scale to kv_cache
|
||||
if ((lane_idx == 0) || (lane_idx == 16)) {
|
||||
@@ -461,11 +476,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__gfx942__)
|
||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
||||
#else
|
||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
||||
#endif
|
||||
float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;
|
||||
|
||||
if (use_ue8m0) {
|
||||
scale = exp2f(ceilf(log2f(scale)));
|
||||
}
|
||||
@@ -608,7 +620,8 @@ void reshape_and_cache(
|
||||
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
|
||||
head_stride, key_stride, value_stride, num_heads, head_size, \
|
||||
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
|
||||
reinterpret_cast<const float*>(v_scale.data_ptr()));
|
||||
reinterpret_cast<const float*>(v_scale.data_ptr()), \
|
||||
kv_scale_stride);
|
||||
|
||||
void reshape_and_cache_flash(
|
||||
torch::Tensor& key, // [num_tokens, num_heads, head_size]
|
||||
@@ -617,8 +630,9 @@ void reshape_and_cache_flash(
|
||||
torch::Tensor&
|
||||
value_cache, // [num_blocks, block_size, num_heads, head_size]
|
||||
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale) {
|
||||
const std::string& kv_cache_dtype,
|
||||
torch::Tensor& k_scale, // [1] or [num_heads]
|
||||
torch::Tensor& v_scale) { // [1] or [num_heads]
|
||||
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
|
||||
// slot_mapping.size(0) because of padding for CUDA graphs.
|
||||
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
|
||||
@@ -641,6 +655,12 @@ void reshape_and_cache_flash(
|
||||
int64_t head_stride = key_cache.stride(2);
|
||||
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
|
||||
|
||||
TORCH_CHECK(k_scale.sizes() == v_scale.sizes(),
|
||||
"k_scale and v_scale must have the same shape");
|
||||
TORCH_CHECK(k_scale.numel() == 1 || k_scale.numel() == num_heads,
|
||||
"k_scale and v_scale must be of shape [1] or [num_heads]");
|
||||
int kv_scale_stride = (k_scale.numel() > 1) ? 1 : 0;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(num_heads * head_size, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
||||
|
||||
@@ -80,8 +80,10 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
|
||||
}
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16& vec);
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16& vec);
|
||||
void save(void* ptr) const {
|
||||
vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
|
||||
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
|
||||
@@ -190,6 +192,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {};
|
||||
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}
|
||||
|
||||
explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
@@ -474,6 +479,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
: reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8),
|
||||
vld1q_f32(ptr + 12)}) {}
|
||||
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}
|
||||
|
||||
explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec8& data) {
|
||||
@@ -756,6 +764,96 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
|
||||
};
|
||||
};
|
||||
|
||||
struct INT8Vec64 : public Vec<INT8Vec64> {
|
||||
constexpr static int VEC_ELEM_NUM = 64;
|
||||
union AliasReg {
|
||||
int8x16x4_t reg;
|
||||
int8_t values[VEC_ELEM_NUM];
|
||||
};
|
||||
int8x16x4_t reg;
|
||||
|
||||
explicit INT8Vec64(const int8_t* ptr) { reg = vld1q_s8_x4(ptr); }
|
||||
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}
|
||||
|
||||
void save(int8_t* ptr) const { vst1q_s8_x4(ptr, reg); }
|
||||
|
||||
// masked store
|
||||
void save(int8_t* p, int elem_num) const {
|
||||
TORCH_CHECK(elem_num <= VEC_ELEM_NUM && elem_num > 0);
|
||||
|
||||
if (elem_num == VEC_ELEM_NUM) {
|
||||
vst1q_s8_x4(p, reg);
|
||||
return;
|
||||
}
|
||||
|
||||
const int full_quadwords = elem_num / 16;
|
||||
const int remaining_bytes = elem_num % 16;
|
||||
|
||||
for (int i = 0; i < full_quadwords; ++i) {
|
||||
vst1q_s8(p + 16 * i, reg.val[i]);
|
||||
}
|
||||
|
||||
if (remaining_bytes) {
|
||||
const int8x16_t v = reg.val[full_quadwords];
|
||||
int8_t* tail = p + 16 * full_quadwords;
|
||||
switch (remaining_bytes) {
|
||||
case 15:
|
||||
tail[14] = vgetq_lane_s8(v, 14);
|
||||
[[fallthrough]];
|
||||
case 14:
|
||||
tail[13] = vgetq_lane_s8(v, 13);
|
||||
[[fallthrough]];
|
||||
case 13:
|
||||
tail[12] = vgetq_lane_s8(v, 12);
|
||||
[[fallthrough]];
|
||||
case 12:
|
||||
tail[11] = vgetq_lane_s8(v, 11);
|
||||
[[fallthrough]];
|
||||
case 11:
|
||||
tail[10] = vgetq_lane_s8(v, 10);
|
||||
[[fallthrough]];
|
||||
case 10:
|
||||
tail[9] = vgetq_lane_s8(v, 9);
|
||||
[[fallthrough]];
|
||||
case 9:
|
||||
tail[8] = vgetq_lane_s8(v, 8);
|
||||
[[fallthrough]];
|
||||
case 8:
|
||||
tail[7] = vgetq_lane_s8(v, 7);
|
||||
[[fallthrough]];
|
||||
case 7:
|
||||
tail[6] = vgetq_lane_s8(v, 6);
|
||||
[[fallthrough]];
|
||||
case 6:
|
||||
tail[5] = vgetq_lane_s8(v, 5);
|
||||
[[fallthrough]];
|
||||
case 5:
|
||||
tail[4] = vgetq_lane_s8(v, 4);
|
||||
[[fallthrough]];
|
||||
case 4:
|
||||
tail[3] = vgetq_lane_s8(v, 3);
|
||||
[[fallthrough]];
|
||||
case 3:
|
||||
tail[2] = vgetq_lane_s8(v, 2);
|
||||
[[fallthrough]];
|
||||
case 2:
|
||||
tail[1] = vgetq_lane_s8(v, 1);
|
||||
[[fallthrough]];
|
||||
case 1:
|
||||
tail[0] = vgetq_lane_s8(v, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ASIMD does not support non-temporal stores
|
||||
void nt_save(int8_t* ptr) const { save(ptr); }
|
||||
}; // INT8Vec64
|
||||
|
||||
template <typename T>
|
||||
struct VecType {
|
||||
using vec_type = void;
|
||||
|
||||
@@ -360,13 +360,14 @@ void onednn_scaled_mm(
|
||||
const std::optional<torch::Tensor>& azp, // [M] or [1]
|
||||
const std::optional<torch::Tensor>& azp_adj, // [M] or [1]
|
||||
const std::optional<torch::Tensor>& bias, // [N]
|
||||
int64_t handler) {
|
||||
const torch::Tensor& handler_tensor) {
|
||||
CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
|
||||
TORCH_CHECK(a.dim() == 2);
|
||||
TORCH_CHECK(a.is_contiguous());
|
||||
TORCH_CHECK(c.is_contiguous());
|
||||
W8A8MatMulPrimitiveHandler* ptr =
|
||||
reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
|
||||
reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
|
||||
handler_tensor.item<int64_t>());
|
||||
const int32_t* azp_ptr = nullptr;
|
||||
if (azp.has_value()) {
|
||||
azp_ptr = azp->data_ptr<int32_t>();
|
||||
@@ -519,13 +520,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
|
||||
|
||||
void onednn_mm(torch::Tensor& c, // [M, OC], row-major
|
||||
const torch::Tensor& a, // [M, IC], row-major
|
||||
const std::optional<torch::Tensor>& bias, int64_t handler) {
|
||||
const std::optional<torch::Tensor>& bias,
|
||||
const torch::Tensor& handler_tensor) {
|
||||
CPU_KERNEL_GUARD_IN(onednn_mm)
|
||||
TORCH_CHECK(a.dim() == 2);
|
||||
TORCH_CHECK(a.stride(-1) == 1);
|
||||
TORCH_CHECK(c.stride(-1) == 1);
|
||||
MatMulPrimitiveHandler* ptr =
|
||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler);
|
||||
reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());
|
||||
|
||||
// ACL matmuls expect contiguous source tensors
|
||||
#ifdef VLLM_USE_ACL
|
||||
|
||||
@@ -5,6 +5,10 @@
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef __aarch64__
|
||||
#include <atomic>
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
#define MAX_SHM_RANK_NUM 8
|
||||
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
|
||||
@@ -34,8 +38,17 @@ struct KernelVecType<c10::Half> {
|
||||
};
|
||||
|
||||
struct ThreadSHMContext {
|
||||
#ifdef __aarch64__
|
||||
// memory model is weaker on AArch64, so we use atomic variables for
|
||||
// consumer (load-acquire) and producer (store-release) to make sure
|
||||
// that a stamp cannot be ready before the corresponding data is ready.
|
||||
std::atomic<char> _curr_thread_stamp[2];
|
||||
std::atomic<char> _ready_thread_stamp[2];
|
||||
static_assert(std::atomic<char>::is_always_lock_free);
|
||||
#else
|
||||
volatile char _curr_thread_stamp[2];
|
||||
volatile char _ready_thread_stamp[2];
|
||||
#endif // __aarch64__
|
||||
int local_stamp_buffer_idx;
|
||||
int remote_stamp_buffer_idx;
|
||||
int thread_id;
|
||||
@@ -62,10 +75,17 @@ struct ThreadSHMContext {
|
||||
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
|
||||
TORCH_CHECK((size_t)this % 64 == 0);
|
||||
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
|
||||
#ifdef __aarch64__
|
||||
_curr_thread_stamp[0].store(1, std::memory_order_relaxed);
|
||||
_curr_thread_stamp[1].store(1, std::memory_order_relaxed);
|
||||
_ready_thread_stamp[0].store(0, std::memory_order_relaxed);
|
||||
_ready_thread_stamp[1].store(0, std::memory_order_relaxed);
|
||||
#else
|
||||
_curr_thread_stamp[0] = 1;
|
||||
_curr_thread_stamp[1] = 1;
|
||||
_ready_thread_stamp[0] = 0;
|
||||
_ready_thread_stamp[1] = 0;
|
||||
#endif // __aarch64__
|
||||
_thread_buffer_mask[0] = 0;
|
||||
_thread_buffer_mask[1] = 0;
|
||||
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
|
||||
@@ -103,19 +123,43 @@ struct ThreadSHMContext {
|
||||
_thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
|
||||
}
|
||||
|
||||
char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
|
||||
char get_curr_stamp(int idx) const {
|
||||
#ifdef __aarch64__
|
||||
return _curr_thread_stamp[idx].load(std::memory_order_acquire);
|
||||
#else
|
||||
return _curr_thread_stamp[idx];
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
|
||||
char get_ready_stamp(int idx) const {
|
||||
#ifdef __aarch64__
|
||||
return _ready_thread_stamp[idx].load(std::memory_order_acquire);
|
||||
#else
|
||||
return _ready_thread_stamp[idx];
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
void next_stamp() {
|
||||
#ifdef __aarch64__
|
||||
_curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
|
||||
1, std::memory_order_release);
|
||||
#else
|
||||
_mm_mfence();
|
||||
_curr_thread_stamp[local_stamp_buffer_idx] += 1;
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
void commit_ready_stamp() {
|
||||
#ifdef __aarch64__
|
||||
_ready_thread_stamp[local_stamp_buffer_idx].store(
|
||||
_curr_thread_stamp[local_stamp_buffer_idx].load(
|
||||
std::memory_order_relaxed),
|
||||
std::memory_order_release);
|
||||
#else
|
||||
_mm_mfence();
|
||||
_ready_thread_stamp[local_stamp_buffer_idx] =
|
||||
_curr_thread_stamp[local_stamp_buffer_idx];
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
|
||||
@@ -142,7 +186,11 @@ struct ThreadSHMContext {
|
||||
break;
|
||||
}
|
||||
++_spinning_count;
|
||||
#ifdef __aarch64__
|
||||
__asm__ __volatile__("yield");
|
||||
#else
|
||||
_mm_pause();
|
||||
#endif // __aarch64__
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,13 +19,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||
const std::optional<torch::Tensor>& azp,
|
||||
const std::optional<torch::Tensor>& azp_adj,
|
||||
const std::optional<torch::Tensor>& bias,
|
||||
int64_t handler);
|
||||
const torch::Tensor& handler_tensor);
|
||||
|
||||
int64_t create_onednn_mm_handler(const torch::Tensor& b,
|
||||
int64_t primitive_cache_size);
|
||||
|
||||
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||
const std::optional<torch::Tensor>& bias, int64_t handler);
|
||||
const std::optional<torch::Tensor>& bias,
|
||||
const torch::Tensor& handler_tensor);
|
||||
|
||||
bool is_onednn_acl_supported();
|
||||
|
||||
@@ -196,7 +197,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// oneDNN GEMM
|
||||
ops.def(
|
||||
"onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
|
||||
"int handler) -> ()");
|
||||
"Tensor handler_tensor) -> ()");
|
||||
ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
|
||||
|
||||
// Check if oneDNN was built with ACL backend
|
||||
@@ -212,7 +213,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
|
||||
ops.def(
|
||||
"onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
|
||||
"Tensor? azp_adj, Tensor? bias, int handler) -> ()");
|
||||
"Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
|
||||
ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
|
||||
|
||||
// Compute int8 quantized tensor for given scaling factor.
|
||||
@@ -230,7 +231,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
#endif
|
||||
|
||||
// SHM CCL
|
||||
#ifdef __AVX512F__
|
||||
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
|
||||
ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
|
||||
&init_shm_manager);
|
||||
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
|
||||
@@ -250,7 +251,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
|
||||
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
|
||||
&shm_recv_tensor_list);
|
||||
#endif
|
||||
#endif // #if defined(__AVX512F__) || defined(__aarch64__)
|
||||
|
||||
// sgl-kernels
|
||||
#if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__)
|
||||
|
||||
@@ -31,8 +31,6 @@ namespace moe {
|
||||
|
||||
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
|
||||
constexpr int32_t WARP_SIZE = 32;
|
||||
constexpr int32_t BLOCK_SIZE = 512;
|
||||
constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
|
||||
|
||||
namespace warp_topk {
|
||||
|
||||
@@ -65,14 +63,6 @@ __forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T, typename idxT>
|
||||
int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
|
||||
int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
|
||||
int64_t n = std::max<int>(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
|
||||
return max(cache_topk,
|
||||
round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
|
||||
}
|
||||
|
||||
template <int size, bool ascending, bool reverse, typename T, typename idxT,
|
||||
bool is_stable>
|
||||
struct BitonicMerge {
|
||||
@@ -267,6 +257,15 @@ class WarpSort {
|
||||
}
|
||||
}
|
||||
|
||||
// Accessors for per-lane selected value/index.
|
||||
// NOTE: For the common case `capacity == WARP_SIZE`, `max_arr_len_ == 1`
|
||||
// and callers should use `i == 0`.
|
||||
__device__ __forceinline__ idxT get_idx(int i = 0) const {
|
||||
return idx_arr_[i];
|
||||
}
|
||||
|
||||
__device__ __forceinline__ T get_val(int i = 0) const { return val_arr_[i]; }
|
||||
|
||||
protected:
|
||||
static constexpr int max_arr_len_ = capacity / WARP_SIZE;
|
||||
|
||||
@@ -285,6 +284,7 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
|
||||
__device__ WarpSelect(idxT k, T dummy)
|
||||
: WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
|
||||
k_th_(dummy),
|
||||
k_th_idx_(0),
|
||||
k_th_lane_((k - 1) % WARP_SIZE) {
|
||||
extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[];
|
||||
|
||||
@@ -346,9 +346,6 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
|
||||
idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
|
||||
merge_buf_(val, idx);
|
||||
}
|
||||
|
||||
// after done(), smem is used for merging results among warps
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -503,255 +500,186 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, ScoringFunc SF>
|
||||
__global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
|
||||
int64_t const num_tokens,
|
||||
int64_t const num_cases,
|
||||
int64_t const n_group,
|
||||
int64_t const num_experts_per_group) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
|
||||
if (case_id < num_cases) {
|
||||
input += case_id * num_experts_per_group;
|
||||
// bias is per expert group, offset to current group
|
||||
int32_t group_id = case_id % n_group;
|
||||
BiasT const* group_bias = bias + group_id * num_experts_per_group;
|
||||
output += case_id;
|
||||
|
||||
cg::thread_block block = cg::this_thread_block();
|
||||
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;");
|
||||
#endif
|
||||
topk_with_k2<T, BiasT, SF>(output, input, group_bias, tile, lane_id,
|
||||
num_experts_per_group);
|
||||
}
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF,
|
||||
int NGroup = -1>
|
||||
__global__ void group_idx_and_topk_idx_kernel(
|
||||
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
|
||||
BiasT const* bias, int64_t const num_tokens, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool renormalize,
|
||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
|
||||
__global__ void grouped_topk_fused_kernel(
|
||||
T* scores, float* topk_values, IdxT* topk_indices, BiasT const* bias,
|
||||
int64_t const num_tokens, int64_t const num_experts, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk, bool renormalize,
|
||||
double routed_scaling_factor) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
int32_t case_id =
|
||||
blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token
|
||||
scores += case_id * num_experts;
|
||||
group_scores += case_id * n_group;
|
||||
topk_values += case_id * topk;
|
||||
topk_indices += case_id * topk;
|
||||
int32_t const token_id = static_cast<int32_t>(blockIdx.x);
|
||||
if (token_id >= num_tokens) {
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr bool kUseStaticNGroup = (NGroup > 0);
|
||||
// use int32 to avoid implicit conversion
|
||||
int32_t const n_group_i32 =
|
||||
kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
|
||||
int32_t const warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t const lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
int32_t align_num_experts_per_group =
|
||||
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
|
||||
int32_t const n_group_i32 = static_cast<int32_t>(n_group);
|
||||
int32_t const topk_group_i32 = static_cast<int32_t>(topk_group);
|
||||
int32_t const topk_i32 = static_cast<int32_t>(topk);
|
||||
int32_t const num_experts_i32 = static_cast<int32_t>(num_experts);
|
||||
|
||||
int32_t const num_warps = blockDim.x / WARP_SIZE;
|
||||
if (warp_id >= n_group_i32 || num_warps < n_group_i32) {
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t const num_experts_per_group = num_experts_i32 / n_group_i32;
|
||||
|
||||
T* scores_token = scores + static_cast<int64_t>(token_id) * num_experts;
|
||||
|
||||
cg::thread_block block = cg::this_thread_block();
|
||||
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
|
||||
|
||||
extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to
|
||||
// store the target topk idx
|
||||
int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
|
||||
T* s_topk_value =
|
||||
reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
|
||||
warp_id * topk;
|
||||
s_topk_idx += warp_id * topk;
|
||||
extern __shared__ char smem_buf[];
|
||||
// warpSelect internal staging buffer layout
|
||||
size_t const val_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
|
||||
size_t const val_bytes_aligned =
|
||||
warp_topk::round_up_to_multiple_of<256>(val_bytes);
|
||||
size_t const idx_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
|
||||
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
|
||||
|
||||
T value = neg_inf<T>();
|
||||
T topk_group_value = neg_inf<T>();
|
||||
int32_t num_equalto_topkth_group;
|
||||
// user-managed shared memory starts after warpSelect internal staging.
|
||||
uintptr_t ptr_u = reinterpret_cast<uintptr_t>(smem_buf + internal_bytes);
|
||||
ptr_u = (ptr_u + 15) & ~static_cast<uintptr_t>(15); // align to 16B
|
||||
T* s_group_scores = reinterpret_cast<T*>(ptr_u);
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before
|
||||
// acqbulk because it's ptr arithmetic
|
||||
#endif
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
// calculate group_idx
|
||||
int32_t target_num_min =
|
||||
WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
|
||||
// The check is necessary to avoid abnormal input
|
||||
if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
|
||||
value = group_scores[lane_id];
|
||||
}
|
||||
// phase 1: per-group scan
|
||||
int32_t const group_offset = warp_id * num_experts_per_group;
|
||||
topk_with_k2<T, BiasT, SF>(s_group_scores + warp_id,
|
||||
scores_token + group_offset, bias + group_offset,
|
||||
tile, lane_id, num_experts_per_group);
|
||||
|
||||
int count_equal_to_top_value = WARP_SIZE - n_group_i32;
|
||||
int pre_count_equal_to_top_value = 0;
|
||||
// Use loop to find the largset top_group
|
||||
while (count_equal_to_top_value < target_num_min) {
|
||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||
if (value == topk_group_value) {
|
||||
value = neg_inf<T>();
|
||||
}
|
||||
pre_count_equal_to_top_value = count_equal_to_top_value;
|
||||
count_equal_to_top_value =
|
||||
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
|
||||
}
|
||||
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// phase 2: warp0 selects groups + merges candidates to final topk
|
||||
if (warp_id != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
topk_values += static_cast<int64_t>(token_id) * topk;
|
||||
topk_indices += static_cast<int64_t>(token_id) * topk;
|
||||
|
||||
// select topk_group groups by group score
|
||||
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||
/* is_stable */ true>
|
||||
queue((int32_t)topk, neg_inf<T>());
|
||||
group_sel(static_cast<int32_t>(topk_group_i32), neg_inf<T>());
|
||||
|
||||
int count_equalto_topkth_group = 0;
|
||||
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
auto process_group = [&](int i_group) {
|
||||
if ((group_scores[i_group] > topk_group_value) ||
|
||||
((group_scores[i_group] == topk_group_value) &&
|
||||
(count_equalto_topkth_group < num_equalto_topkth_group))) {
|
||||
int32_t offset = i_group * num_experts_per_group;
|
||||
for (int32_t i = lane_id; i < align_num_experts_per_group;
|
||||
i += WARP_SIZE) {
|
||||
T candidates = neg_inf<T>();
|
||||
if (i < num_experts_per_group) {
|
||||
// apply scoring function (if any) and add bias
|
||||
T input = scores[offset + i];
|
||||
if (is_finite(input)) {
|
||||
T score = apply_scoring<SF>(input);
|
||||
candidates = score + static_cast<T>(bias[offset + i]);
|
||||
}
|
||||
}
|
||||
queue.add(candidates, offset + i);
|
||||
}
|
||||
if (group_scores[i_group] == topk_group_value) {
|
||||
count_equalto_topkth_group++;
|
||||
// all lanes must participate in WarpSelect::add().
|
||||
T gscore = (lane_id < n_group_i32) ? s_group_scores[lane_id] : neg_inf<T>();
|
||||
group_sel.add(gscore, lane_id);
|
||||
group_sel.done();
|
||||
|
||||
// proceed only if the k-th selected group score is not -inf
|
||||
bool proceed = false;
|
||||
if (topk_group_i32 > 0) {
|
||||
int const kth_lane = topk_group_i32 - 1;
|
||||
// broadcast the k-th selected group score to all lanes
|
||||
T kth_val = __shfl_sync(FULL_WARP_MASK, group_sel.get_val(0), kth_lane);
|
||||
proceed = (kth_val != neg_inf<T>());
|
||||
}
|
||||
|
||||
if (!proceed) {
|
||||
for (int i = lane_id; i < topk_i32; i += WARP_SIZE) {
|
||||
topk_indices[i] = static_cast<IdxT>(i);
|
||||
topk_values[i] = 1.0f / static_cast<float>(topk_i32);
|
||||
}
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
// merge per-group topk candidates for selected groups, then select topk
|
||||
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||
/* is_stable */ true>
|
||||
expert_sel(static_cast<int32_t>(topk_i32), neg_inf<T>());
|
||||
|
||||
// selected group ids reside in lanes [0, topk_group)
|
||||
int32_t sel_gid_lane = (lane_id < topk_group_i32) ? group_sel.get_idx(0) : 0;
|
||||
|
||||
// add candidates from selected groups to expert_sel
|
||||
for (int32_t g = 0; g < topk_group_i32; ++g) {
|
||||
int32_t gid = __shfl_sync(FULL_WARP_MASK, sel_gid_lane, g);
|
||||
int32_t const offset = gid * num_experts_per_group;
|
||||
int32_t const align_num_experts_per_group =
|
||||
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
|
||||
for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) {
|
||||
// all lanes must call `add()` the same number of times.
|
||||
T cand = neg_inf<T>();
|
||||
int32_t idx = 0;
|
||||
if (i < num_experts_per_group) {
|
||||
idx = offset + i;
|
||||
T input = scores_token[idx];
|
||||
if (is_finite(input)) {
|
||||
T score = apply_scoring<SF>(input);
|
||||
cand = score + static_cast<T>(bias[idx]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if constexpr (kUseStaticNGroup) {
|
||||
#pragma unroll
|
||||
for (int i_group = 0; i_group < NGroup; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
} else {
|
||||
for (int i_group = 0; i_group < n_group_i32; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
}
|
||||
queue.done();
|
||||
// Get the topk_idx
|
||||
queue.dumpIdx(s_topk_idx);
|
||||
}
|
||||
|
||||
// Load the valid score value
|
||||
// Calculate the summation
|
||||
float topk_sum = 1e-20;
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
for (int i = lane_id;
|
||||
i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
|
||||
i += WARP_SIZE) {
|
||||
T value = cuda_cast<T, float>(0.0f);
|
||||
if (i < topk) {
|
||||
// Load the score value (without bias) for normalization
|
||||
T input = scores[s_topk_idx[i]];
|
||||
value = apply_scoring<SF>(input);
|
||||
s_topk_value[i] = value;
|
||||
}
|
||||
if (renormalize) {
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
}
|
||||
expert_sel.add(cand, idx);
|
||||
}
|
||||
}
|
||||
expert_sel.done();
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
if (if_proceed_next_topk) {
|
||||
float scale = routed_scaling_factor;
|
||||
if (renormalize) {
|
||||
scale /= topk_sum;
|
||||
}
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
float base = cuda_cast<float, T>(s_topk_value[i]);
|
||||
float value = base * scale;
|
||||
topk_indices[i] = s_topk_idx[i];
|
||||
topk_values[i] = value;
|
||||
}
|
||||
} else {
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
topk_indices[i] = i;
|
||||
topk_values[i] = 1.0f / topk;
|
||||
}
|
||||
}
|
||||
// Note: when if_proceed_next_topk==false, choose the first 8 experts as the
|
||||
// default result.
|
||||
// compute unbiased routing weights + optional renorm.
|
||||
float lane_unbiased = 0.0f;
|
||||
IdxT lane_idx = 0;
|
||||
if (lane_id < topk_i32) {
|
||||
lane_idx = static_cast<IdxT>(expert_sel.get_idx(0));
|
||||
T in = scores_token[static_cast<int32_t>(lane_idx)];
|
||||
lane_unbiased = cuda_cast<float, T>(apply_scoring<SF>(in));
|
||||
}
|
||||
|
||||
float topk_sum = 1e-20f;
|
||||
if (renormalize) {
|
||||
topk_sum += cg::reduce(tile, lane_unbiased, cg::plus<float>());
|
||||
}
|
||||
|
||||
float scale = static_cast<float>(routed_scaling_factor);
|
||||
if (renormalize) {
|
||||
scale /= topk_sum;
|
||||
}
|
||||
|
||||
if (lane_id < topk_i32) {
|
||||
topk_indices[lane_id] = lane_idx;
|
||||
topk_values[lane_id] = lane_unbiased * scale;
|
||||
}
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
|
||||
inline void launch_group_idx_and_topk_kernel(
|
||||
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
|
||||
float* topk_values, IdxT* topk_indices, BiasT const* bias,
|
||||
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
|
||||
int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool const renormalize,
|
||||
double const routed_scaling_factor) {
|
||||
auto launch = [&](auto* kernel_instance2) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
|
||||
topk_values, topk_indices, bias, num_tokens, n_group,
|
||||
topk_group, topk, num_experts, num_experts_per_group,
|
||||
renormalize, routed_scaling_factor);
|
||||
};
|
||||
|
||||
switch (n_group) {
|
||||
case 4: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 4>);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 8>);
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 16>);
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 32>);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF>);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, typename IdxT>
|
||||
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
IdxT* topk_indices, BiasT const* bias,
|
||||
int64_t const num_tokens, int64_t const num_experts,
|
||||
int64_t const n_group, int64_t const topk_group,
|
||||
int64_t const topk, bool const renormalize,
|
||||
double const routed_scaling_factor, int const scoring_func,
|
||||
bool enable_pdl = false, cudaStream_t const stream = 0) {
|
||||
int64_t num_cases = num_tokens * n_group;
|
||||
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
void invokeNoAuxTc(T* scores, float* topk_values, IdxT* topk_indices,
|
||||
BiasT const* bias, int64_t const num_tokens,
|
||||
int64_t const num_experts, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk,
|
||||
bool const renormalize, double const routed_scaling_factor,
|
||||
int const scoring_func, bool enable_pdl = false,
|
||||
cudaStream_t const stream = 0) {
|
||||
cudaLaunchConfig_t config;
|
||||
config.gridDim = topk_with_k2_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = 0;
|
||||
// One block per token; one warp per group.
|
||||
config.gridDim = static_cast<uint32_t>(num_tokens);
|
||||
config.blockDim = static_cast<uint32_t>(n_group) * WARP_SIZE;
|
||||
// Dynamic shared memory: WarpSelect staging + per-group topk buffers.
|
||||
int32_t const num_warps = static_cast<int32_t>(n_group);
|
||||
size_t const val_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
|
||||
size_t const val_bytes_aligned =
|
||||
warp_topk::round_up_to_multiple_of<256>(val_bytes);
|
||||
size_t const idx_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
|
||||
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
|
||||
size_t const extra_bytes = 16 + static_cast<size_t>(n_group) * sizeof(T);
|
||||
config.dynamicSmemBytes = internal_bytes + extra_bytes;
|
||||
config.stream = stream;
|
||||
cudaLaunchAttribute attrs[1];
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
@@ -759,66 +687,35 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
auto const sf = static_cast<ScoringFunc>(scoring_func);
|
||||
int64_t const num_experts_per_group = num_experts / n_group;
|
||||
auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
|
||||
num_tokens, num_cases, n_group, num_experts_per_group);
|
||||
};
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_NONE>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
auto* kernel_instance =
|
||||
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_NONE>;
|
||||
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values,
|
||||
topk_indices, bias, num_tokens, num_experts, n_group,
|
||||
topk_group, topk, renormalize, routed_scaling_factor);
|
||||
return;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_SIGMOID>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
auto* kernel_instance =
|
||||
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_SIGMOID>;
|
||||
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values,
|
||||
topk_indices, bias, num_tokens, num_experts, n_group,
|
||||
topk_group, topk, renormalize, routed_scaling_factor);
|
||||
return;
|
||||
}
|
||||
default:
|
||||
// should be guarded by higher level checks.
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
|
||||
int64_t topk_with_k_group_num_blocks =
|
||||
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
size_t dynamic_smem_in_bytes =
|
||||
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
|
||||
topk);
|
||||
config.gridDim = topk_with_k_group_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = dynamic_smem_in_bytes;
|
||||
config.stream = stream;
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_NONE>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_SIGMOID>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT) \
|
||||
template void invokeNoAuxTc<T, BiasT, IdxT>( \
|
||||
T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
|
||||
BiasT const* bias, int64_t const num_tokens, int64_t const num_experts, \
|
||||
int64_t const n_group, int64_t const topk_group, int64_t const topk, \
|
||||
bool const renormalize, double const routed_scaling_factor, \
|
||||
#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT) \
|
||||
template void invokeNoAuxTc<T, BiasT, IdxT>( \
|
||||
T * scores, float* topk_values, IdxT* topk_indices, BiasT const* bias, \
|
||||
int64_t const num_tokens, int64_t const num_experts, \
|
||||
int64_t const n_group, int64_t const topk_group, int64_t const topk, \
|
||||
bool const renormalize, double const routed_scaling_factor, \
|
||||
int const scoring_func, bool enable_pdl, cudaStream_t const stream);
|
||||
|
||||
INSTANTIATE_NOAUX_TC(float, float, int32_t);
|
||||
@@ -843,17 +740,21 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
int64_t num_tokens = input_size[0];
|
||||
int64_t num_experts = input_size[1];
|
||||
TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
|
||||
TORCH_CHECK(n_group > 0, "n_group must be positive");
|
||||
TORCH_CHECK(topk > 0, "topk must be positive");
|
||||
TORCH_CHECK(topk_group > 0, "topk_group must be positive");
|
||||
TORCH_CHECK(topk_group <= n_group, "topk_group must be <= n_group");
|
||||
TORCH_CHECK(num_experts % n_group == 0,
|
||||
"num_experts should be divisible by n_group");
|
||||
TORCH_CHECK(n_group <= 32,
|
||||
"n_group should be smaller than or equal to 32 for now");
|
||||
TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
|
||||
TORCH_CHECK(topk <= topk_group * (num_experts / n_group),
|
||||
"topk must be <= topk_group * (num_experts / n_group)");
|
||||
TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
|
||||
scoring_func == vllm::moe::SCORING_SIGMOID,
|
||||
"scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");
|
||||
|
||||
torch::Tensor group_scores = torch::empty(
|
||||
{num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
|
||||
// Always output float32 for topk_values (eliminates Python-side conversion)
|
||||
torch::Tensor topk_values = torch::empty(
|
||||
{num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
|
||||
@@ -868,7 +769,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
case torch::kFloat16: \
|
||||
vllm::moe::invokeNoAuxTc<T, half, IdxT>( \
|
||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
||||
reinterpret_cast<half const*>(bias.data_ptr()), num_tokens, \
|
||||
@@ -879,7 +779,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
case torch::kFloat32: \
|
||||
vllm::moe::invokeNoAuxTc<T, float, IdxT>( \
|
||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
||||
reinterpret_cast<float const*>(bias.data_ptr()), num_tokens, \
|
||||
@@ -890,7 +789,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
case torch::kBFloat16: \
|
||||
vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>( \
|
||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
||||
reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), \
|
||||
|
||||
@@ -58,7 +58,7 @@ TEMPLATE = (
|
||||
"( MARLIN_KERNEL_PARAMS );"
|
||||
)
|
||||
|
||||
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
|
||||
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
|
||||
|
||||
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
|
||||
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
|
||||
#endif
|
||||
|
||||
#include "quantization/gptq_marlin/marlin.cuh"
|
||||
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
||||
#include "quantization/marlin/marlin.cuh"
|
||||
#include "quantization/marlin/marlin_dtypes.cuh"
|
||||
#include "core/scalar_type.hpp"
|
||||
|
||||
#define MARLIN_KERNEL_PARAMS \
|
||||
|
||||
@@ -23,10 +23,10 @@
|
||||
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
|
||||
#endif
|
||||
|
||||
#include "quantization/gptq_marlin/marlin.cuh"
|
||||
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
||||
#include "quantization/gptq_marlin/dequant.h"
|
||||
#include "quantization/gptq_marlin/marlin_mma.h"
|
||||
#include "quantization/marlin/marlin.cuh"
|
||||
#include "quantization/marlin/marlin_dtypes.cuh"
|
||||
#include "quantization/marlin/dequant.h"
|
||||
#include "quantization/marlin/marlin_mma.h"
|
||||
#include "core/scalar_type.hpp"
|
||||
|
||||
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
||||
|
||||
@@ -126,14 +126,16 @@ thread_config_t small_batch_thread_configs[] = {
|
||||
|
||||
// thread_k, thread_n, num_threads
|
||||
{128, 128, 256},
|
||||
{64, 128, 128}};
|
||||
{64, 128, 128},
|
||||
{128, 64, 128}};
|
||||
|
||||
thread_config_t large_batch_thread_configs[] = {
|
||||
// Ordered by priority
|
||||
|
||||
// thread_k, thread_n, num_threads
|
||||
{64, 256, 256},
|
||||
{64, 128, 128}};
|
||||
{64, 128, 128},
|
||||
{128, 64, 128}};
|
||||
|
||||
typedef struct {
|
||||
int blocks_per_sm;
|
||||
|
||||
@@ -4,7 +4,13 @@
|
||||
|
||||
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
||||
torch::Tensor& token_expert_indices,
|
||||
torch::Tensor& gating_output, bool renormalize);
|
||||
torch::Tensor& gating_output, bool renormalize,
|
||||
std::optional<torch::Tensor> bias);
|
||||
|
||||
void topk_sigmoid(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
||||
torch::Tensor& token_expert_indices,
|
||||
torch::Tensor& gating_output, bool renormalize,
|
||||
std::optional<torch::Tensor> bias);
|
||||
|
||||
void moe_sum(torch::Tensor& input, torch::Tensor& output);
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ void moe_permute(
|
||||
auto sort_workspace = torch::empty(
|
||||
{sorter_size},
|
||||
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
|
||||
auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
|
||||
torch::Tensor topk_ids_for_sort = topk_ids;
|
||||
auto permuted_experts_id = torch::empty_like(topk_ids);
|
||||
auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
|
||||
|
||||
@@ -62,12 +62,13 @@ void moe_permute(
|
||||
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
|
||||
valid_num_ptr =
|
||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
||||
preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
|
||||
topk_ids_for_sort = topk_ids.clone();
|
||||
preprocessTopkIdLauncher(get_ptr<int>(topk_ids_for_sort), n_token * topk,
|
||||
expert_map_ptr, n_expert, stream);
|
||||
}
|
||||
// expert sort topk expert id and scan expert id get expert_first_token_offset
|
||||
sortAndScanExpert(
|
||||
get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
|
||||
get_ptr<const int>(topk_ids_for_sort), get_ptr<int>(token_expert_indices),
|
||||
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
|
||||
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
|
||||
|
||||
@@ -109,7 +109,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
|
||||
sorted_indices, total_indices, num_experts, expert_first_token_offset);
|
||||
}
|
||||
|
||||
void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
|
||||
void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
|
||||
int* permuted_experts, int* permuted_rows,
|
||||
int64_t* expert_first_token_offset, int num_rows,
|
||||
int num_experts, int num_experts_per_node, int k,
|
||||
|
||||
@@ -48,7 +48,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
|
||||
int64_t* expert_first_token_offset,
|
||||
cudaStream_t stream);
|
||||
|
||||
void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
|
||||
void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
|
||||
int* permuted_experts, int* permuted_rows,
|
||||
int64_t* expert_first_token_offset, int num_rows,
|
||||
int num_experts, int num_experts_per_node, int k,
|
||||
|
||||
@@ -62,6 +62,12 @@ __device__ __forceinline__ float toFloat(T value) {
|
||||
}
|
||||
}
|
||||
|
||||
// Scoring function enums
|
||||
enum ScoringFunc {
|
||||
SCORING_SOFTMAX = 0, // apply softmax
|
||||
SCORING_SIGMOID = 1 // apply sigmoid
|
||||
};
|
||||
|
||||
// ====================== Softmax things ===============================
|
||||
// We have our own implementation of softmax here so we can support transposing the output
|
||||
// in the softmax kernel when we extend this module to support expert-choice routing.
|
||||
@@ -125,6 +131,27 @@ __launch_bounds__(TPB) __global__
|
||||
}
|
||||
}
|
||||
|
||||
template <int TPB, typename InputType>
|
||||
__launch_bounds__(TPB) __global__
|
||||
void moeSigmoid(const InputType* input, const bool* finished, float* output, const int num_cols)
|
||||
{
|
||||
const int thread_row_offset = blockIdx.x * num_cols;
|
||||
|
||||
// Don't touch finished rows.
|
||||
if ((finished != nullptr) && finished[blockIdx.x])
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val = toFloat(input[idx]);
|
||||
const float sigmoid_val = 1.0f / (1.0f + __expf(-val));
|
||||
output[idx] = sigmoid_val;
|
||||
}
|
||||
}
|
||||
|
||||
template <int TPB, typename IndType>
|
||||
__launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const float* inputs_after_softmax,
|
||||
@@ -136,7 +163,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const int k,
|
||||
const int start_expert,
|
||||
const int end_expert,
|
||||
const bool renormalize)
|
||||
const bool renormalize,
|
||||
const float* bias)
|
||||
{
|
||||
|
||||
using cub_kvp = cub::KeyValuePair<int, float>;
|
||||
@@ -162,7 +190,13 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
{
|
||||
const int idx = thread_read_offset + expert;
|
||||
inp_kvp.key = expert;
|
||||
inp_kvp.value = inputs_after_softmax[idx];
|
||||
|
||||
// Apply correction bias if provided
|
||||
if (bias != nullptr) {
|
||||
inp_kvp.value = inputs_after_softmax[idx] + bias[expert];
|
||||
} else {
|
||||
inp_kvp.value = inputs_after_softmax[idx];
|
||||
}
|
||||
|
||||
for (int prior_k = 0; prior_k < k_idx; ++prior_k)
|
||||
{
|
||||
@@ -186,12 +220,13 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const bool should_process_row = row_is_active && node_uses_expert;
|
||||
|
||||
const int idx = k * block_row + k_idx;
|
||||
output[idx] = result_kvp.value;
|
||||
// Return the unbiased scores for output weights
|
||||
output[idx] = inputs_after_softmax[thread_read_offset + expert];
|
||||
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
|
||||
assert(indices[idx] >= 0);
|
||||
source_rows[idx] = k_idx * num_rows + block_row;
|
||||
if (renormalize) {
|
||||
selected_sum += result_kvp.value;
|
||||
selected_sum += inputs_after_softmax[thread_read_offset + expert];
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -225,10 +260,12 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
2) This implementation assumes k is small, but will work for any k.
|
||||
*/
|
||||
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType,
|
||||
typename InputType = float, ScoringFunc SF>
|
||||
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
|
||||
void topkGating(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
|
||||
const float* bias)
|
||||
{
|
||||
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
|
||||
std::is_same_v<InputType, __half>,
|
||||
@@ -353,61 +390,89 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
}
|
||||
}
|
||||
|
||||
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
||||
// convert to float afterwards for the exp + sum reduction.
|
||||
float thread_max = row_chunk[0];
|
||||
if constexpr (SF == SCORING_SOFTMAX) {
|
||||
// First, we perform a max reduce within the thread.
|
||||
float thread_max = row_chunk[0];
|
||||
#pragma unroll
|
||||
for (int ii = 1; ii < VPT; ++ii)
|
||||
{
|
||||
for (int ii = 1; ii < VPT; ++ii) {
|
||||
thread_max = max(thread_max, row_chunk[ii]);
|
||||
}
|
||||
}
|
||||
|
||||
// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
|
||||
}
|
||||
}
|
||||
|
||||
// From this point, thread max in all the threads have the max within the row.
|
||||
// Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
|
||||
float row_sum = 0;
|
||||
// From this point, thread max in all the threads have the max within the row.
|
||||
// Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
|
||||
float row_sum = 0;
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
row_chunk[ii] = expf(row_chunk[ii] - thread_max);
|
||||
row_sum += row_chunk[ii];
|
||||
}
|
||||
}
|
||||
|
||||
// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern.
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
|
||||
}
|
||||
}
|
||||
|
||||
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
|
||||
// respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
|
||||
// compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
|
||||
// However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
|
||||
// argmax after computing the softmax.
|
||||
const float reciprocal_row_sum = 1.f / row_sum;
|
||||
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
|
||||
// respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
|
||||
// compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
|
||||
// However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
|
||||
// argmax after computing the softmax.
|
||||
const float reciprocal_row_sum = 1.f / row_sum;
|
||||
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
|
||||
}
|
||||
} else if constexpr (SF == SCORING_SIGMOID) {
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
row_chunk[ii] = 1.0f / (1.0f + __expf(-row_chunk[ii]));
|
||||
}
|
||||
}
|
||||
|
||||
// Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
|
||||
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||
|
||||
// If bias is not null, use biased value for selection
|
||||
float row_chunk_for_choice[VPT];
|
||||
// Apply correction bias
|
||||
if (bias != nullptr) {
|
||||
#pragma unroll
|
||||
for (int ldg = 0; ldg < LDG_PER_THREAD; ++ldg) {
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
|
||||
const int expert = first_elt_read_by_thread + ldg * COLS_PER_GROUP_LDG + ii;
|
||||
float bias_val = expert < NUM_EXPERTS ? bias[expert] : 0.0f;
|
||||
row_chunk_for_choice[ldg * ELTS_PER_LDG + ii] = row_chunk[ldg * ELTS_PER_LDG + ii] + bias_val;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii) {
|
||||
row_chunk_for_choice[ii] = row_chunk[ii];
|
||||
}
|
||||
}
|
||||
|
||||
// Now, row_chunk contains the softmax / sigmoid of the row chunk. Now, I want to find the topk elements in each row, along
|
||||
// with the max index.
|
||||
int start_col = first_elt_read_by_thread;
|
||||
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||
|
||||
float selected_sum = 0.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
// First, each thread does the local argmax
|
||||
float max_val_for_choice = row_chunk_for_choice[0];
|
||||
float max_val = row_chunk[0];
|
||||
int expert = start_col;
|
||||
#pragma unroll
|
||||
@@ -416,12 +481,14 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < ELTS_PER_LDG; ++ii)
|
||||
{
|
||||
float val_for_choice = row_chunk_for_choice[ldg * ELTS_PER_LDG + ii];
|
||||
float val = row_chunk[ldg * ELTS_PER_LDG + ii];
|
||||
|
||||
// No check on the experts here since columns with the smallest index are processed first and only
|
||||
// updated if > (not >=)
|
||||
if (val > max_val)
|
||||
if (val_for_choice > max_val_for_choice)
|
||||
{
|
||||
max_val_for_choice = val_for_choice;
|
||||
max_val = val;
|
||||
expert = col + ii;
|
||||
}
|
||||
@@ -434,12 +501,14 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
float other_max_for_choice = VLLM_SHFL_XOR_SYNC_WIDTH(max_val_for_choice, mask, THREADS_PER_ROW);
|
||||
float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
|
||||
int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
|
||||
|
||||
// We want lower indices to "win" in every thread so we break ties this way
|
||||
if (other_max > max_val || (other_max == max_val && other_expert < expert))
|
||||
if (other_max_for_choice > max_val_for_choice || (other_max_for_choice == max_val_for_choice && other_expert < expert))
|
||||
{
|
||||
max_val_for_choice = other_max_for_choice;
|
||||
max_val = other_max;
|
||||
expert = other_expert;
|
||||
}
|
||||
@@ -474,7 +543,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
{
|
||||
const int offset_for_expert = expert % ELTS_PER_LDG;
|
||||
// Safe to set to any negative value since row_chunk values must be between 0 and 1.
|
||||
row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
|
||||
row_chunk_for_choice[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -508,10 +577,10 @@ struct TopkConstants
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
|
||||
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType, ScoringFunc SF>
|
||||
void topkGatingLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
|
||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
|
||||
cudaStream_t stream)
|
||||
const float* bias, cudaStream_t stream)
|
||||
{
|
||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
|
||||
@@ -521,43 +590,51 @@ void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finishe
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
|
||||
topkGating<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType, SF><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize, bias);
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream);
|
||||
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES, \
|
||||
IndType, InputType, SF>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
|
||||
bias, stream);
|
||||
#else
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
} else { \
|
||||
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
||||
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES, \
|
||||
IndType, InputType, SF>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
|
||||
bias, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES, \
|
||||
IndType, InputType, SF>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
|
||||
bias, stream); \
|
||||
} else { \
|
||||
assert(false && \
|
||||
"Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename IndType, typename InputType>
|
||||
void topkGatingSoftmaxKernelLauncher(
|
||||
template <typename IndType, typename InputType, ScoringFunc SF>
|
||||
void topkGatingKernelLauncher(
|
||||
const InputType* gating_output,
|
||||
float* topk_weights,
|
||||
IndType* topk_indices,
|
||||
int* token_expert_indices,
|
||||
float* softmax_workspace,
|
||||
float* workspace,
|
||||
const int num_tokens,
|
||||
const int num_experts,
|
||||
const int topk,
|
||||
const bool renormalize,
|
||||
const float* bias,
|
||||
cudaStream_t stream) {
|
||||
static constexpr int WARPS_PER_TB = 4;
|
||||
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
|
||||
@@ -569,64 +646,71 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
#endif
|
||||
switch (num_experts) {
|
||||
case 1:
|
||||
LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 2:
|
||||
LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 4:
|
||||
LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 8:
|
||||
LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 16:
|
||||
LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 32:
|
||||
LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 64:
|
||||
LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 128:
|
||||
LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 256:
|
||||
LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 512:
|
||||
LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
// (CUDA only) support multiples of 64 when num_experts is not power of 2.
|
||||
// ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
|
||||
// alternatively we can test 4 bytes loading and enable it in future.
|
||||
#ifndef USE_ROCM
|
||||
case 192:
|
||||
LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 320:
|
||||
LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 384:
|
||||
LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 448:
|
||||
LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 576:
|
||||
LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
#endif
|
||||
default: {
|
||||
TORCH_CHECK(softmax_workspace != nullptr,
|
||||
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
|
||||
TORCH_CHECK(workspace != nullptr,
|
||||
"workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
|
||||
static constexpr int TPB = 256;
|
||||
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, softmax_workspace, num_experts);
|
||||
if constexpr (SF == SCORING_SOFTMAX) {
|
||||
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, workspace, num_experts);
|
||||
} else if constexpr (SF == SCORING_SIGMOID) {
|
||||
moeSigmoid<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, workspace, num_experts);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported scoring func");
|
||||
}
|
||||
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
|
||||
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
|
||||
num_experts, topk, 0, num_experts, renormalize);
|
||||
workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
|
||||
num_experts, topk, 0, num_experts, renormalize, bias);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -635,40 +719,55 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
} // namespace vllm
|
||||
|
||||
|
||||
template<typename ComputeType>
|
||||
void dispatch_topk_softmax_launch(
|
||||
template<typename ComputeType, vllm::moe::ScoringFunc SF>
|
||||
void dispatch_topk_launch(
|
||||
torch::Tensor& gating_output,
|
||||
torch::Tensor& topk_weights,
|
||||
torch::Tensor& topk_indices,
|
||||
torch::Tensor& token_expert_indices,
|
||||
torch::Tensor& softmax_workspace,
|
||||
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
|
||||
{
|
||||
int num_tokens, int num_experts, int topk, bool renormalize,
|
||||
std::optional<torch::Tensor> bias,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
const float* bias_ptr = nullptr;
|
||||
if (bias.has_value()) {
|
||||
const torch::Tensor& bias_tensor = bias.value();
|
||||
TORCH_CHECK(bias_tensor.scalar_type() == at::ScalarType::Float, "bias tensor must be float32");
|
||||
TORCH_CHECK(bias_tensor.dim() == 1, "bias tensor must be 1D");
|
||||
TORCH_CHECK(bias_tensor.size(0) == num_experts, "bias size mismatch, expected: ", num_experts);
|
||||
TORCH_CHECK(bias_tensor.is_contiguous(), "bias tensor must be contiguous");
|
||||
bias_ptr = bias_tensor.data_ptr<float>();
|
||||
}
|
||||
|
||||
if (topk_indices.scalar_type() == at::ScalarType::Int) {
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
|
||||
vllm::moe::topkGatingKernelLauncher<int, ComputeType, SF>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
num_tokens, num_experts, topk, renormalize,
|
||||
bias_ptr, stream);
|
||||
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
|
||||
vllm::moe::topkGatingKernelLauncher<uint32_t, ComputeType, SF>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<uint32_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
num_tokens, num_experts, topk, renormalize,
|
||||
bias_ptr, stream);
|
||||
} else {
|
||||
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
|
||||
vllm::moe::topkGatingKernelLauncher<int64_t, ComputeType, SF>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int64_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
num_tokens, num_experts, topk, renormalize,
|
||||
bias_ptr, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -677,7 +776,8 @@ void topk_softmax(
|
||||
torch::Tensor& topk_indices, // [num_tokens, topk]
|
||||
torch::Tensor& token_expert_indices, // [num_tokens, topk]
|
||||
torch::Tensor& gating_output, // [num_tokens, num_experts]
|
||||
bool renormalize)
|
||||
bool renormalize,
|
||||
std::optional<torch::Tensor> bias)
|
||||
{
|
||||
const int num_experts = gating_output.size(-1);
|
||||
const auto num_tokens = gating_output.numel() / num_experts;
|
||||
@@ -693,14 +793,55 @@ void topk_softmax(
|
||||
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
|
||||
|
||||
if (gating_output.scalar_type() == at::ScalarType::Float) {
|
||||
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
dispatch_topk_launch<float, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
|
||||
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
dispatch_topk_launch<__half, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
|
||||
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
|
||||
}
|
||||
}
|
||||
|
||||
void topk_sigmoid(
|
||||
torch::Tensor& topk_weights, // [num_tokens, topk]
|
||||
torch::Tensor& topk_indices, // [num_tokens, topk]
|
||||
torch::Tensor& token_expert_indices, // [num_tokens, topk]
|
||||
torch::Tensor& gating_output, // [num_tokens, num_experts]
|
||||
bool renormalize,
|
||||
std::optional<torch::Tensor> bias)
|
||||
{
|
||||
const int num_experts = gating_output.size(-1);
|
||||
const auto num_tokens = gating_output.numel() / num_experts;
|
||||
const int topk = topk_weights.size(-1);
|
||||
|
||||
const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
|
||||
const bool needs_workspace = !is_pow_2 || num_experts > 256;
|
||||
const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
|
||||
torch::Tensor workspace = torch::empty({workspace_size}, workspace_options);
|
||||
|
||||
if (gating_output.scalar_type() == at::ScalarType::Float) {
|
||||
dispatch_topk_launch<float, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
|
||||
dispatch_topk_launch<__half, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
|
||||
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
|
||||
}
|
||||
|
||||
@@ -5,9 +5,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
// Apply topk softmax to the gating outputs.
|
||||
m.def(
|
||||
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
|
||||
"token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
|
||||
"token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
|
||||
"bias) -> ()");
|
||||
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
|
||||
|
||||
// Apply topk sigmoid to the gating outputs.
|
||||
m.def(
|
||||
"topk_sigmoid(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
|
||||
"token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
|
||||
"bias) -> ()");
|
||||
m.impl("topk_sigmoid", torch::kCUDA, &topk_sigmoid);
|
||||
|
||||
// Calculate the result of moe by summing up the partial results
|
||||
// from all selected experts.
|
||||
m.def("moe_sum(Tensor input, Tensor! output) -> ()");
|
||||
|
||||
@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
@@ -299,7 +293,8 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
|
||||
|
||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
torch::Tensor& output_scale,
|
||||
torch::Tensor const& input_scale);
|
||||
torch::Tensor const& input_scale,
|
||||
bool is_sf_swizzled_layout);
|
||||
|
||||
void scaled_fp4_experts_quant(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
|
||||
@@ -27,17 +27,24 @@
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "launch_bounds_utils.h"
|
||||
|
||||
// Define before including nvfp4_utils.cuh so the header
|
||||
// can use this macro during compilation.
|
||||
#define NVFP4_ENABLE_ELTS16 1
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out,
|
||||
uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols,
|
||||
int32_t num_padded_cols,
|
||||
Type const* __restrict__ in,
|
||||
float const* __restrict__ SFScale,
|
||||
uint32_t* __restrict__ out,
|
||||
uint32_t* __restrict__ SFout) {
|
||||
using PackedVec = vllm::PackedVec<Type>;
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
|
||||
@@ -49,34 +56,60 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
|
||||
float const SFScaleVal = (SFScale == nullptr) ? 1.0f : SFScale[0];
|
||||
|
||||
int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
// Input tensor row/col loops.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
|
||||
colIdx += blockDim.x) {
|
||||
if (colIdx < num_padded_cols) {
|
||||
PackedVec in_vec;
|
||||
PackedVec in_vec2;
|
||||
int64_t inOffset =
|
||||
rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) +
|
||||
numCols / CVT_FP4_ELTS_PER_THREAD + colIdx;
|
||||
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];
|
||||
|
||||
// Get the output tensor offset.
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
auto& out_pos = out[outOffset];
|
||||
bool valid = (rowIdx < numRows) && (elem_idx < numCols);
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
|
||||
valid);
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 8],
|
||||
valid);
|
||||
} else {
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
|
||||
valid);
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 4],
|
||||
valid);
|
||||
}
|
||||
|
||||
// Compute silu and mul
|
||||
PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
|
||||
PackedVec out_silu_mul = compute_silu_mul<Type>(in_vec, in_vec2);
|
||||
|
||||
auto sf_out =
|
||||
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx, colIdx, numKTiles, SFout);
|
||||
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
|
||||
sf_out);
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
out_silu_mul, SFScaleVal, sf_out);
|
||||
|
||||
if (valid) {
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
|
||||
uint64_t packed64 =
|
||||
(uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
|
||||
reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
|
||||
} else {
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -103,17 +136,23 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
||||
auto output_ptr = static_cast<int64_t*>(output.data_ptr());
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
|
||||
|
||||
int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
|
||||
int grid_x = std::min(
|
||||
int(m), std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
|
||||
dim3 grid(grid_x, grid_y);
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(
|
||||
input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
|
||||
m, n, input_ptr, input_sf_ptr,
|
||||
m, n, sf_n_unpadded, input_ptr, input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
|
||||
@@ -140,8 +140,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
|
||||
|
||||
out_pos =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
quant_input, SFScaleVal, sf_out);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -246,8 +246,8 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
|
||||
|
||||
out_pos =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
quant_input, SFScaleVal, sf_out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,8 @@
|
||||
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
torch::Tensor const& input,
|
||||
torch::Tensor const& output_sf,
|
||||
torch::Tensor const& input_sf);
|
||||
torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout);
|
||||
#endif
|
||||
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
@@ -51,10 +52,12 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
|
||||
#endif
|
||||
|
||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
torch::Tensor& output_sf, torch::Tensor const& input_sf) {
|
||||
torch::Tensor& output_sf, torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout) {
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf);
|
||||
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
|
||||
is_sf_swizzled_layout);
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
|
||||
}
|
||||
|
||||
@@ -27,29 +27,23 @@
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "launch_bounds_utils.h"
|
||||
|
||||
// Define before including nvfp4_utils.cuh so the header
|
||||
// can use this macro during compilation.
|
||||
#define NVFP4_ENABLE_ELTS16 1
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
template <typename Int>
|
||||
__host__ __device__ inline Int round_up(Int x, Int y) {
|
||||
static_assert(std::is_integral_v<Int>,
|
||||
"round_up argument must be integral type");
|
||||
return ((x + y - 1) / y) * y;
|
||||
}
|
||||
|
||||
// Compute effective rows for grid configuration with swizzled SF layouts.
|
||||
inline int computeEffectiveRows(int m) {
|
||||
constexpr int ROW_TILE = 128;
|
||||
return round_up(m, ROW_TILE);
|
||||
}
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, int32_t num_padded_cols,
|
||||
Type const* __restrict__ in,
|
||||
float const* __restrict__ SFScale,
|
||||
uint32_t* __restrict__ out, uint32_t* __restrict__ SFout) {
|
||||
using PackedVec = vllm::PackedVec<Type>;
|
||||
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
|
||||
@@ -59,33 +53,31 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
int32_t const numKTiles = (numCols + 63) / 64;
|
||||
|
||||
int sf_m = round_up<int>(numRows, 128);
|
||||
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
|
||||
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
|
||||
int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
|
||||
int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
|
||||
float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
|
||||
|
||||
// Iterate over all rows and cols including padded ones -
|
||||
// ensures we visit every single scale factor address to initialize it.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x;
|
||||
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
|
||||
colIdx += blockDim.x) {
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
if (colIdx < num_padded_cols) {
|
||||
PackedVec in_vec;
|
||||
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
|
||||
// If we are outside valid rows OR outside valid columns -> Use Zeros
|
||||
if (rowIdx >= numRows || elem_idx >= numCols) {
|
||||
memset(&in_vec, 0, sizeof(PackedVec));
|
||||
|
||||
bool valid = (rowIdx < numRows) && (elem_idx < numCols);
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
|
||||
valid);
|
||||
} else {
|
||||
// Valid Region: Load actual data
|
||||
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
|
||||
valid);
|
||||
}
|
||||
|
||||
auto sf_out =
|
||||
@@ -94,13 +86,85 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
rowIdx, colIdx, numKTiles, SFout);
|
||||
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
|
||||
cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
in_vec, global_scale, sf_out);
|
||||
|
||||
// We do NOT write output for padding because the 'out' tensor is not
|
||||
// padded.
|
||||
if (rowIdx < numRows && elem_idx < numCols) {
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
out[inOffset] = out_val;
|
||||
if (valid) {
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
|
||||
uint64_t packed64 =
|
||||
(uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
|
||||
reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
|
||||
} else {
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
cvt_fp16_to_fp4_sf_major(int32_t numRows, int32_t numCols,
|
||||
int32_t sf_n_unpadded, Type const* __restrict__ in,
|
||||
float const* __restrict__ SFScale,
|
||||
uint32_t* __restrict__ out,
|
||||
uint32_t* __restrict__ SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
|
||||
"Vec size is not matched.");
|
||||
|
||||
int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
|
||||
|
||||
// Iterate over all rows and cols including padded ones -
|
||||
// ensures we visit every single scale factor address to initialize it.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
|
||||
if (colIdx < sf_n_unpadded) {
|
||||
PackedVec in_vec;
|
||||
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
|
||||
// If we are outside valid rows OR outside valid columns -> Use Zeros
|
||||
bool valid = (rowIdx < numRows) && (elem_idx < numCols);
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
|
||||
valid);
|
||||
} else {
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
|
||||
valid);
|
||||
}
|
||||
|
||||
auto sf_out =
|
||||
sf_out_rowmajor_u8<uint32_t>(rowIdx, colIdx, sf_n_unpadded, SFout);
|
||||
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
in_vec, global_scale, sf_out);
|
||||
|
||||
// We do NOT write output for padding because the 'out' tensor is not
|
||||
// padded.
|
||||
if (valid) {
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
|
||||
uint64_t packed64 =
|
||||
(uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
|
||||
reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
|
||||
} else {
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -111,7 +175,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
torch::Tensor const& input,
|
||||
torch::Tensor const& output_sf,
|
||||
torch::Tensor const& input_sf) {
|
||||
torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout) {
|
||||
int32_t m = input.size(0);
|
||||
int32_t n = input.size(1);
|
||||
|
||||
@@ -129,19 +194,48 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
|
||||
int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
|
||||
|
||||
// Grid, Block size. Each thread converts 8 values.
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
int effectiveRows = vllm::computeEffectiveRows(m);
|
||||
dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
// NOTE: We don't support e8m0 scales at this moment.
|
||||
vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
|
||||
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
}
|
||||
if (is_sf_swizzled_layout) {
|
||||
int sf_n_int = int(vllm::round_up(sf_n_unpadded, 4) / 4);
|
||||
int32_t num_padded_cols =
|
||||
sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
int grid_y = vllm::div_round_up(num_padded_cols, static_cast<int>(block.x));
|
||||
int grid_x =
|
||||
std::min(vllm::computeEffectiveRows(m),
|
||||
std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
|
||||
dim3 grid(grid_x, grid_y);
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
// NOTE: We don't support e8m0 scales at this moment.
|
||||
vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
|
||||
m, n, num_padded_cols, input_ptr, input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
} else {
|
||||
int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
|
||||
int grid_x = std::min(
|
||||
m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
|
||||
dim3 grid(grid_x, grid_y);
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
// NOTE: We don't support e8m0 scales at this moment.
|
||||
vllm::cvt_fp16_to_fp4_sf_major<cuda_type, false>
|
||||
<<<grid, block, 0, stream>>>(m, n, sf_n_unpadded, input_ptr,
|
||||
input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,9 +19,17 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp8.h>
|
||||
|
||||
#define ELTS_PER_THREAD 8
|
||||
|
||||
#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
|
||||
defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
|
||||
#define ELTS_PER_THREAD 16
|
||||
constexpr int CVT_FP4_ELTS_PER_THREAD = 16;
|
||||
constexpr bool CVT_FP4_PACK16 = true;
|
||||
#else
|
||||
#define ELTS_PER_THREAD 8
|
||||
constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
|
||||
constexpr bool CVT_FP4_PACK16 = false;
|
||||
#endif
|
||||
|
||||
constexpr int CVT_FP4_SF_VEC_SIZE = 16;
|
||||
|
||||
namespace vllm {
|
||||
@@ -68,19 +76,46 @@ struct TypeConverter<__nv_bfloat16> {
|
||||
using Type = __nv_bfloat162;
|
||||
};
|
||||
|
||||
#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
|
||||
defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
|
||||
// Define a 32 bytes packed data type.
|
||||
template <class Type>
|
||||
struct alignas(32) PackedVec {
|
||||
typename TypeConverter<Type>::Type elts[8];
|
||||
};
|
||||
#else
|
||||
// Define a 16 bytes packed data type.
|
||||
template <class Type>
|
||||
struct PackedVec {
|
||||
struct alignas(16) PackedVec {
|
||||
typename TypeConverter<Type>::Type elts[4];
|
||||
};
|
||||
#endif
|
||||
|
||||
template <>
|
||||
struct PackedVec<__nv_fp8_e4m3> {
|
||||
__nv_fp8x2_e4m3 elts[8];
|
||||
};
|
||||
|
||||
template <typename Int>
|
||||
__host__ __device__ inline Int round_up(Int x, Int y) {
|
||||
static_assert(std::is_integral_v<Int>,
|
||||
"round_up argument must be integral type");
|
||||
return ((x + y - 1) / y) * y;
|
||||
}
|
||||
|
||||
template <typename Int>
|
||||
__host__ __device__ __forceinline__ Int div_round_up(Int x, Int y) {
|
||||
return (x + y - 1) / y;
|
||||
}
|
||||
|
||||
// Compute effective rows for grid configuration with swizzled SF layouts.
|
||||
inline int computeEffectiveRows(int m) {
|
||||
constexpr int ROW_TILE = 128;
|
||||
return round_up(m, ROW_TILE);
|
||||
}
|
||||
|
||||
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
@@ -101,7 +136,7 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
}
|
||||
|
||||
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
__device__ __forceinline__ uint32_t fp32_vec8_to_e2m1(float2 (&array)[4]) {
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
@@ -114,20 +149,115 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
"}\n"
|
||||
: "=r"(val)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
|
||||
return val;
|
||||
}
|
||||
|
||||
struct u32x2 {
|
||||
uint32_t lo, hi;
|
||||
};
|
||||
|
||||
using fp4_packed_t = std::conditional_t<CVT_FP4_PACK16, u32x2, uint32_t>;
|
||||
|
||||
__device__ __forceinline__ u32x2 fp32_vec16_to_e2m1(float2 (&array)[8]) {
|
||||
u32x2 out;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 b0;\n"
|
||||
".reg .b8 b1;\n"
|
||||
".reg .b8 b2;\n"
|
||||
".reg .b8 b3;\n"
|
||||
".reg .b8 b4;\n"
|
||||
".reg .b8 b5;\n"
|
||||
".reg .b8 b6;\n"
|
||||
".reg .b8 b7;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b0, %3, %2;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b1, %5, %4;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b2, %7, %6;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b3, %9, %8;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b4, %11, %10;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b5, %13, %12;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b6, %15, %14;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b7, %17, %16;\n"
|
||||
"mov.b32 %0, {b0, b1, b2, b3};\n"
|
||||
"mov.b32 %1, {b4, b5, b6, b7};\n"
|
||||
"}\n"
|
||||
: "=r"(out.lo), "=r"(out.hi)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y),
|
||||
"f"(array[4].x), "f"(array[4].y), "f"(array[5].x), "f"(array[5].y),
|
||||
"f"(array[6].x), "f"(array[6].y), "f"(array[7].x), "f"(array[7].y));
|
||||
return out;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint32_t pack_fp4(float2 (&v)[4]) {
|
||||
return fp32_vec8_to_e2m1(v);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ u32x2 pack_fp4(float2 (&v)[8]) {
|
||||
return fp32_vec16_to_e2m1(v);
|
||||
}
|
||||
|
||||
// Fast reciprocal.
|
||||
inline __device__ float reciprocal_approximate_ftz(float a) {
|
||||
__device__ __forceinline__ float reciprocal_approximate_ftz(float a) {
|
||||
float b;
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(b) : "f"(a));
|
||||
return b;
|
||||
}
|
||||
|
||||
template <class Type>
|
||||
__device__ __forceinline__ void ld128_or_zero_cg_u32(PackedVec<Type>& out,
|
||||
const void* ptr,
|
||||
bool pred) {
|
||||
uint32_t r0, r1, r2, r3;
|
||||
|
||||
asm volatile(
|
||||
"{\n"
|
||||
" .reg .pred pr;\n"
|
||||
" setp.ne.u32 pr, %4, 0;\n"
|
||||
" mov.u32 %0, 0;\n"
|
||||
" mov.u32 %1, 0;\n"
|
||||
" mov.u32 %2, 0;\n"
|
||||
" mov.u32 %3, 0;\n"
|
||||
" @pr ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%5];\n"
|
||||
"}\n"
|
||||
: "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3)
|
||||
: "r"((int)pred), "l"(ptr));
|
||||
|
||||
*reinterpret_cast<uint4*>(&out) = uint4{r0, r1, r2, r3};
|
||||
}
|
||||
|
||||
template <class Type>
|
||||
__device__ __forceinline__ void ld256_or_zero_cg_u32(PackedVec<Type>& out,
|
||||
const void* ptr,
|
||||
bool pred) {
|
||||
uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
|
||||
|
||||
asm volatile(
|
||||
"{\n"
|
||||
" .reg .pred pr;\n"
|
||||
" setp.ne.u32 pr, %8, 0;\n"
|
||||
" mov.u32 %0, 0;\n"
|
||||
" mov.u32 %1, 0;\n"
|
||||
" mov.u32 %2, 0;\n"
|
||||
" mov.u32 %3, 0;\n"
|
||||
" mov.u32 %4, 0;\n"
|
||||
" mov.u32 %5, 0;\n"
|
||||
" mov.u32 %6, 0;\n"
|
||||
" mov.u32 %7, 0;\n"
|
||||
" @pr ld.global.cg.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%9];\n"
|
||||
"}\n"
|
||||
: "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
|
||||
"=r"(r7)
|
||||
: "r"((int)pred), "l"(ptr));
|
||||
|
||||
reinterpret_cast<uint4*>(&out)[0] = uint4{r0, r1, r2, r3};
|
||||
reinterpret_cast<uint4*>(&out)[1] = uint4{r4, r5, r6, r7};
|
||||
}
|
||||
|
||||
// Compute SF output offset for swizzled tensor core layout.
|
||||
// SF layout: [numMTiles, numKTiles, 32, 4, 4]
|
||||
// Caller must precompute: numKTiles = (numCols + 63) / 64
|
||||
@@ -166,21 +296,41 @@ __device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
|
||||
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
|
||||
}
|
||||
|
||||
template <class SFType>
|
||||
__device__ __forceinline__ uint8_t* sf_out_rowmajor_u8(int row, int pack,
|
||||
int packs_per_row_sf,
|
||||
SFType* SFout) {
|
||||
constexpr int PACK = CVT_FP4_ELTS_PER_THREAD;
|
||||
constexpr int THREADS_PER_SF =
|
||||
CVT_FP4_SF_VEC_SIZE / PACK; // 1 if PACK=16, 2 else PACK=8
|
||||
|
||||
if (threadIdx.x % THREADS_PER_SF != 0) return nullptr;
|
||||
|
||||
int sf_col =
|
||||
pack / THREADS_PER_SF; // PACK=16 => sf_col=pack; PACK=8 => sf_col=pack/2
|
||||
int64_t off = (int64_t)row * packs_per_row_sf + sf_col;
|
||||
|
||||
return (uint8_t*)SFout + off;
|
||||
}
|
||||
|
||||
// Quantizes the provided PackedVec into the uint32_t output
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
uint8_t* SFout) {
|
||||
template <class Type, int CVT_FP4_NUM_THREADS_PER_SF, bool UE8M0_SF = false>
|
||||
__device__ __forceinline__ fp4_packed_t
|
||||
cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
|
||||
// Get absolute maximum values among the local 8 values.
|
||||
auto localMax = __habs2(vec.elts[0]);
|
||||
|
||||
// Local maximum value.
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
localMax = __hmax2(localMax, __habs2(vec.elts[i]));
|
||||
}
|
||||
|
||||
// Get the absolute maximum among all 16 values (two threads).
|
||||
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
|
||||
|
||||
if constexpr (CVT_FP4_NUM_THREADS_PER_SF == 2) {
|
||||
localMax = __hmax2(__shfl_xor_sync(0xffffffffu, localMax, 1), localMax);
|
||||
}
|
||||
// Get the final absolute maximum values.
|
||||
float vecMax = float(__hmax(localMax.x, localMax.y));
|
||||
|
||||
@@ -205,18 +355,17 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
// Convert back to fp32.
|
||||
SFValue = float(tmp);
|
||||
}
|
||||
|
||||
// Write the SF to global memory (STG.8).
|
||||
if (SFout) *SFout = fp8SFVal;
|
||||
|
||||
// Get the output scale.
|
||||
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
|
||||
// reciprocal(SFScaleVal))
|
||||
float outputScale =
|
||||
SFValue != 0 ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
if (SFout) {
|
||||
// Write the SF to global memory (STG.8).
|
||||
*SFout = fp8SFVal;
|
||||
}
|
||||
SFValue != 0.0f ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
// Convert the input to float.
|
||||
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
|
||||
@@ -233,10 +382,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
}
|
||||
|
||||
// Convert to e2m1 values.
|
||||
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
|
||||
|
||||
// Write the e2m1 values to global memory.
|
||||
return e2m1Vec;
|
||||
return pack_fp4(fp2Vals);
|
||||
}
|
||||
|
||||
// silu in float32
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <iostream>
|
||||
#include "../gptq_marlin/marlin_dtypes.cuh"
|
||||
#include "../marlin/marlin_dtypes.cuh"
|
||||
using marlin::MarlinScalarType2;
|
||||
|
||||
namespace allspark {
|
||||
|
||||
@@ -70,15 +70,6 @@ QUANT_CONFIGS = [
|
||||
"thread_m_blocks": THREAD_M_BLOCKS,
|
||||
"group_blocks": [-1, 2, 4, 8],
|
||||
},
|
||||
# HQQ
|
||||
{
|
||||
"a_type": ["kFloat16"],
|
||||
"b_type": "kU4",
|
||||
"thread_configs": THREAD_CONFIGS,
|
||||
"thread_m_blocks": THREAD_M_BLOCKS,
|
||||
"group_blocks": [4],
|
||||
"is_zp_float": True,
|
||||
},
|
||||
# GPTQ-INT4
|
||||
{
|
||||
"b_type": "kU4B8",
|
||||
@@ -46,7 +46,7 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
|
||||
} // namespace marlin
|
||||
|
||||
torch::Tensor gptq_marlin_gemm(
|
||||
torch::Tensor marlin_gemm(
|
||||
torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
|
||||
torch::Tensor& b_q_weight,
|
||||
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
|
||||
@@ -528,7 +528,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
||||
|
||||
} // namespace marlin
|
||||
|
||||
torch::Tensor gptq_marlin_gemm(
|
||||
torch::Tensor marlin_gemm(
|
||||
torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
|
||||
torch::Tensor& b_q_weight,
|
||||
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
|
||||
@@ -856,5 +856,5 @@ torch::Tensor gptq_marlin_gemm(
|
||||
#endif
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("gptq_marlin_gemm", &gptq_marlin_gemm);
|
||||
m.impl("marlin_gemm", &marlin_gemm);
|
||||
}
|
||||
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt) {
|
||||
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
|
||||
auto options_int32 =
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
|
||||
torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
|
||||
|
||||
// Swap-AB should be disabled for FP4 path
|
||||
bool may_swap_ab =
|
||||
force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
|
||||
(topk_ids.numel() <= SWAP_AB_THRESHOLD));
|
||||
|
||||
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
|
||||
atomic_buffer, num_experts, n, k, stream,
|
||||
may_swap_ab);
|
||||
}
|
||||
|
||||
template <bool SWAP_AB>
|
||||
__global__ void compute_problem_sizes_from_expert_offsets(
|
||||
const int64_t* __restrict__ expert_first_token_offset,
|
||||
|
||||
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt) {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
|
||||
problem_sizes2, num_experts, n, k,
|
||||
blockscale_offsets, force_swap_ab);
|
||||
return;
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
|
||||
"kernel for CUDA device capability: ",
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
|
||||
@@ -9,6 +9,10 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias,
|
||||
const int64_t CuCount);
|
||||
|
||||
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias,
|
||||
const int64_t CuCount);
|
||||
|
||||
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
const at::Tensor& scale_a, const at::Tensor& scale_b,
|
||||
|
||||
@@ -13,6 +13,13 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "quantization/w8a8/fp8/common.cuh"
|
||||
|
||||
// TODO(rasmith): The kernels in this file are susceptible to integer overflow
|
||||
// issues, do not take strides, and are unable to handle PyTorch tensors that
|
||||
// return is_contiguous() as False (the tensors may actually be contiguous
|
||||
// in memory).
|
||||
//
|
||||
// However, it may be possible to fix these kernels to handle both issues.
|
||||
|
||||
#if defined(__HIPCC__) && \
|
||||
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__GFX9__
|
||||
@@ -287,6 +294,11 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
|
||||
// To avoid LLVM silently upcasting to double
|
||||
__device__ inline unsigned int min__(uint32_t a, uint32_t b) {
|
||||
return min(a, b);
|
||||
}
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
// This version targets cases where A[] fits LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
@@ -334,11 +346,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min__(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min__(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@@ -633,11 +645,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min__(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min__(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@@ -954,11 +966,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
//----------------------------------------------------
|
||||
#define PCML
|
||||
#ifndef PCML
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min__(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min__(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@@ -975,7 +987,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
? kFit
|
||||
: (kFit - kFit % TUC); // round up to multiple of TUC
|
||||
// if (kFit == 0) kFit = TUC;
|
||||
kFit = min(kFit, K);
|
||||
kFit = min__(kFit, K);
|
||||
|
||||
float sum[N][YTILE];
|
||||
scalar8 sum4[N][YTILE];
|
||||
@@ -1251,6 +1263,7 @@ int mindiv(int N, int div1, int div2) {
|
||||
}
|
||||
for (int i = 12; i >= 0; i--)
|
||||
if (rnds[0] == rnds[i]) return (div2 - i);
|
||||
return 0;
|
||||
}
|
||||
|
||||
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
@@ -1352,6 +1365,536 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#if defined(__gfx950__) // TODO: Add NAVI support
|
||||
// This version targets big A[] cases, where it is much larger than LDS
|
||||
// capacity
|
||||
#define WVSPLITKRC_1KPASS
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N, int GrpsShrB>
|
||||
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
__attribute__((amdgpu_waves_per_eu(1, 1)))
|
||||
wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
|
||||
const int By, const scalar_t* __restrict__ B,
|
||||
const scalar_t* __restrict__ A,
|
||||
const scalar_t* __restrict__ BIAS, float* glbl, scalar_t* C,
|
||||
const int CuCount) {
|
||||
// Use upper half of glbl buffer for atomic reduce counting
|
||||
int* cntr = (int*)(&glbl[M * N]);
|
||||
|
||||
constexpr int NTILE = 16;
|
||||
constexpr int WVLDS_ = (NTILE * THRDS * A_CHUNK);
|
||||
constexpr int APAD = 1;
|
||||
constexpr int ASTRD = 64;
|
||||
constexpr int BPAD = 1;
|
||||
constexpr int BSTRD = 64;
|
||||
constexpr int WVLDS = ((WVLDS_ + (WVLDS_ / BSTRD) * 4 * BPAD));
|
||||
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
|
||||
using scalar16 =
|
||||
__attribute__((__vector_size__((A_CHUNK * 2) * sizeof(float)))) float;
|
||||
using scalar8 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float;
|
||||
using half4 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 2) * sizeof(__bf16)))) __bf16;
|
||||
union bigType {
|
||||
scalar_t h[A_CHUNK];
|
||||
float f[A_CHUNK / 2];
|
||||
unsigned int i[A_CHUNK / 2];
|
||||
float2 f2[A_CHUNK / 4];
|
||||
unsigned long l[A_CHUNK / 4];
|
||||
double d[A_CHUNK / 4];
|
||||
half4 h4[A_CHUNK / 4];
|
||||
scalar8 h8;
|
||||
};
|
||||
using big4 = __attribute__((__vector_size__(4 * sizeof(bigType)))) __bf16;
|
||||
|
||||
__shared__ scalar_t stg[WvPrGrp * WVLDS / GrpsShrB];
|
||||
unsigned int* myStg = (unsigned int*)(&stg[WVLDS * (threadIdx.y / GrpsShrB)]);
|
||||
__shared__ scalar_t s[max_lds_len - WvPrGrp * WVLDS / GrpsShrB];
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
constexpr int TUC_ = (THRDS * UNRL * A_CHUNK);
|
||||
// find biggest k size that fits padded into LDS
|
||||
constexpr uint32_t kFit__ = (max_lds_len - WvPrGrp * WVLDS / GrpsShrB) / N;
|
||||
constexpr uint32_t kFit_ = (kFit__ * ASTRD) / (APAD + ASTRD);
|
||||
uint32_t kFit = kFit_ - (kFit_ % TUC_);
|
||||
uint32_t kfitsPerRdc = (K + kFit - 1) / kFit;
|
||||
|
||||
// find best k split to fill the CUs
|
||||
if (((K + kfitsPerRdc * kFit - 1) / (kfitsPerRdc * kFit)) * numCuWithFullK <=
|
||||
CuCount)
|
||||
while (true) {
|
||||
while (kFit > TUC_) {
|
||||
uint32_t kFit_ = kFit - TUC_;
|
||||
if (((K + (kfitsPerRdc * kFit_ - 1)) / (kfitsPerRdc * kFit_)) *
|
||||
numCuWithFullK >
|
||||
CuCount)
|
||||
break;
|
||||
kFit = kFit_;
|
||||
}
|
||||
if (((K + ((kfitsPerRdc - 1) * kFit - 1)) / ((kfitsPerRdc - 1) * kFit)) *
|
||||
numCuWithFullK <=
|
||||
CuCount)
|
||||
kfitsPerRdc--;
|
||||
else
|
||||
break;
|
||||
}
|
||||
#else
|
||||
int constexpr kFit = 512;
|
||||
int constexpr kfitsPerRdc = 1;
|
||||
#endif
|
||||
|
||||
bool doRdc = (kfitsPerRdc * kFit < K);
|
||||
uint32_t numCuWithFullK =
|
||||
((M + (WvPrGrp * YTILE / GrpsShrB) - 1) / (WvPrGrp * YTILE / GrpsShrB));
|
||||
uint32_t Mmod = numCuWithFullK * (WvPrGrp * YTILE / GrpsShrB);
|
||||
|
||||
// given above k-split, find this wave's position
|
||||
uint32_t kFitPdd = kFit + (kFit / ASTRD) * APAD;
|
||||
uint32_t m0 = (blockIdx.x * WvPrGrp / GrpsShrB) * YTILE;
|
||||
uint32_t m1 = ((threadIdx.y % WvPrGrp) / GrpsShrB) * YTILE;
|
||||
uint32_t m = (m0 + m1) % Mmod;
|
||||
const uint32_t k_str = (m0 / Mmod) * kFit * kfitsPerRdc;
|
||||
uint32_t k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
|
||||
const uint32_t k_rnd = (K + kFit * kfitsPerRdc - 1) / (kFit * kfitsPerRdc);
|
||||
|
||||
scalar8 sum4[N / NTILE / GrpsShrB][1];
|
||||
bigType bigB_[YTILE / GrpsShrB][UNRL];
|
||||
const uint32_t bLoader = (threadIdx.y % GrpsShrB);
|
||||
uint32_t kBase = 0;
|
||||
if (k_str >= K) return;
|
||||
if (m >= Mmod) return;
|
||||
|
||||
bool noreloada = false;
|
||||
constexpr bool FAST_UNSAFE_RDC_INIT = false;
|
||||
|
||||
#ifdef WVSPLITKRC_1KPASS
|
||||
// Early glbl init, B[] loading, if 1KPASS
|
||||
if constexpr (FAST_UNSAFE_RDC_INIT) {
|
||||
if (m + (threadIdx.x % 16) < M)
|
||||
if (doRdc)
|
||||
if (k_str == 0) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
__hip_atomic_store(&cntr[adr_], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
__hip_atomic_store(&glbl[adr], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load first B[] chunk
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k_str + k2 * THRDS * A_CHUNK;
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
|
||||
#pragma unroll
|
||||
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
|
||||
bigB_[y][k2].h8 = (loadnt(
|
||||
(scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
|
||||
}
|
||||
{
|
||||
#else
|
||||
while (m < Mmod) {
|
||||
#endif
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
if constexpr (FAST_UNSAFE_RDC_INIT) {
|
||||
if (m + (threadIdx.x % 16) < M)
|
||||
if (doRdc)
|
||||
if (k_str == 0) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
__hip_atomic_store(&cntr[adr_], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
__hip_atomic_store(&glbl[adr], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
for (uint32_t k1 = k_str; k1 < k_end; k1 += THRDS * A_CHUNK * UNRL) {
|
||||
#else
|
||||
const uint32_t k1 = k_str;
|
||||
{
|
||||
#endif
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
const bool reloada = (!noreloada) &&
|
||||
((k1 == k_str) || (k1 == k_str + kBase + kFit)) &&
|
||||
(k1 < k_end);
|
||||
// load next chunk of A[] to LDS
|
||||
if (reloada) {
|
||||
if (k1 != k_str) kBase += kFit;
|
||||
__syncthreads();
|
||||
#else
|
||||
const bool reloada = (!noreloada) &&
|
||||
((k1 == k_str) || (k1 == k_str + kBase + kFit)) &&
|
||||
(k1 < k_end);
|
||||
if (reloada) {
|
||||
#endif
|
||||
constexpr int sprdN = 4;
|
||||
const uint32_t thrd = ((threadIdx.y / sprdN) * THRDS + threadIdx.x);
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
#pragma unroll
|
||||
for (int k = 0; k < kFit; k += THRDS * (WvPrGrp / sprdN) * A_CHUNK) {
|
||||
#else
|
||||
const unsigned int k = 0;
|
||||
{
|
||||
#endif
|
||||
unsigned int kOff = k + (thrd * A_CHUNK);
|
||||
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
|
||||
const unsigned int k_in = kOffcp + ((threadIdx.y % sprdN)) * K;
|
||||
const unsigned int k_ot = kOff + ((threadIdx.y % sprdN)) * kFitPdd;
|
||||
for (unsigned int n = 0; n < N / 2; n += sprdN) {
|
||||
__builtin_amdgcn_global_load_lds((int*)(&A[k_in + n * K]),
|
||||
(int*)(&s[(k_ot + n * kFitPdd)]),
|
||||
16, 0, 0);
|
||||
if (((threadIdx.y % sprdN)) + n + N / 2 >= actlN) continue;
|
||||
__builtin_amdgcn_global_load_lds(
|
||||
(int*)(&A[k_in + (n + N / 2) * K]),
|
||||
(int*)(&s[(k_ot + (n + N / 2) * kFitPdd)]), 16, 0, 0);
|
||||
}
|
||||
|
||||
// Stage loaded B[] to LDS for MFMA swizzling...
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k1 + k2 * THRDS * A_CHUNK;
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
const bool oob_k = (k_ >= K);
|
||||
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++) {
|
||||
uint32_t idx = threadIdx.x * 4 +
|
||||
(y * GrpsShrB + bLoader) * ((THRDS + BPAD) * 4);
|
||||
// zero out if oob
|
||||
*((scalar8*)&myStg[idx]) =
|
||||
(oob_k || (y * GrpsShrB + bLoader + m >= M))
|
||||
? 0
|
||||
: bigB_[y][k2].h8;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
// Fire load of next B[] chunk...
|
||||
if ((k1 + THRDS * A_CHUNK * UNRL < k_end) &&
|
||||
(k1 + THRDS * A_CHUNK * UNRL < K))
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k1 + THRDS * A_CHUNK * UNRL + k2 * THRDS * A_CHUNK;
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
|
||||
#pragma unroll
|
||||
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
|
||||
bigB_[y][k2].h8 = (loadnt(
|
||||
(scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
|
||||
}
|
||||
#endif
|
||||
|
||||
// B[] staging is cooperative across GrpsShrB, so sync here before reading
|
||||
// back
|
||||
__syncthreads();
|
||||
|
||||
// read back B[] swizzled for MFMA...
|
||||
bigType bigB[YTILE][UNRL];
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
for (uint32_t y = 0; y < YTILE; y++) {
|
||||
unsigned int idx = (threadIdx.x % YTILE) * ((THRDS + BPAD) * 4) +
|
||||
(threadIdx.x / YTILE) * 4 + y * 16;
|
||||
bigB[y][k2].h8 = *((scalar8*)&myStg[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
// rReadback A[] swizzled for MFMA...
|
||||
bigType bigA[N / GrpsShrB][UNRL];
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k1 + k2 * THRDS * A_CHUNK - kBase - k_str;
|
||||
#pragma unroll
|
||||
for (uint32_t nt = 0; nt < N / GrpsShrB; nt += NTILE)
|
||||
#pragma unroll
|
||||
for (uint32_t n = 0; n < NTILE; n++) {
|
||||
uint32_t idxa = (nt + (threadIdx.x % NTILE) +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB)) *
|
||||
kFitPdd +
|
||||
A_CHUNK * ((threadIdx.x / NTILE) + n * 4) + k;
|
||||
bigA[nt + n][k2] = *((const bigType*)(&(s[idxa])));
|
||||
}
|
||||
}
|
||||
|
||||
// Do the MFMAs
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
#pragma unroll
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
|
||||
(k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
|
||||
0, 0);
|
||||
} else { // bf16
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
|
||||
(k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
|
||||
0, 0);
|
||||
}
|
||||
#pragma unroll
|
||||
for (uint32_t j = 1; j < YTILE; j++) {
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
} else { // bf16
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!doRdc) {
|
||||
if (m + (threadIdx.x % 16) < M) {
|
||||
scalar_t biases[N / NTILE / GrpsShrB][4] = {0};
|
||||
if (BIAS)
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
|
||||
}
|
||||
}
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
if (BIAS) sum4[nt][0][j] += __bfloat162float(biases[nt][j]);
|
||||
C[adr] = __float2bfloat16(sum4[nt][0][j]);
|
||||
} else {
|
||||
if (BIAS) sum4[nt][0][j] += __half2float(biases[nt][j]);
|
||||
C[adr] = __float2half(sum4[nt][0][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (m + (threadIdx.x % 16) < M) {
|
||||
int my_cntr;
|
||||
if (!BIAS) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
atomicAdd(&glbl[adr], sum4[nt][0][j]);
|
||||
}
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
my_cntr = atomicAdd(&cntr[adr_], 1);
|
||||
float vals[N / NTILE / GrpsShrB][4] = {};
|
||||
if (my_cntr + 1 == k_rnd) {
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
vals[nt][j] = glbl[adr];
|
||||
}
|
||||
}
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
if (nindx >= actlN) break;
|
||||
int adr = mindx + M * nindx;
|
||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
C[adr] = __float2bfloat16(vals[nt][j]);
|
||||
} else {
|
||||
C[adr] = __float2half(vals[nt][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
|
||||
// Atomic add the output, read biases
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
atomicAdd(&glbl[adr], sum4[nt][0][j]);
|
||||
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
|
||||
}
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
// Update the complete counter
|
||||
my_cntr = atomicAdd(&cntr[adr_], 1);
|
||||
float vals[N / NTILE / GrpsShrB][4] = {};
|
||||
// If we're the last k-shard, read back the value and convert...
|
||||
if (my_cntr + 1 == k_rnd) {
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
vals[nt][j] = glbl[adr];
|
||||
}
|
||||
}
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
if (nindx >= actlN) break;
|
||||
int adr = mindx + M * nindx;
|
||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
vals[nt][j] += __bfloat162float(biases[nt][j]);
|
||||
C[adr] = __float2bfloat16(vals[nt][j]);
|
||||
} else {
|
||||
vals[nt][j] += __half2float(biases[nt][j]);
|
||||
C[adr] = __float2half(vals[nt][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
m0 += CuCount * WvPrGrp * YTILE / GrpsShrB;
|
||||
m = (m0 + m1) % Mmod;
|
||||
k_str = (m0 / Mmod) * kFit * kfitsPerRdc;
|
||||
k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
|
||||
if (k_str >= K) break;
|
||||
kBase = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N, int GrpsShrB>
|
||||
__global__ void wvSplitKrc_(const int actlN, const int K, const int M,
|
||||
const int Bx, const int By, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A,
|
||||
const scalar_t* __restrict__ BIAS, float* glbl,
|
||||
// int* cntr,
|
||||
scalar_t* C, const int CuCount){UNREACHABLE_CODE}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
|
||||
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias,
|
||||
const int64_t CuCount) {
|
||||
auto M_in = in_a.size(0);
|
||||
auto N_in = in_b.size(0);
|
||||
auto K_in = in_a.size(1);
|
||||
auto Bx_in =
|
||||
(in_bias.has_value() && in_bias->numel() > 0)
|
||||
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
|
||||
: 1;
|
||||
auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
|
||||
in_bias->sizes().size() == 2)
|
||||
? in_bias->size(0)
|
||||
: 1;
|
||||
|
||||
TORCH_CHECK(in_a.dtype() == in_b.dtype());
|
||||
TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0");
|
||||
TORCH_CHECK(in_a.dtype() == torch::kFloat16 ||
|
||||
in_a.dtype() == torch::kBFloat16);
|
||||
|
||||
auto out_c = torch::empty(
|
||||
{N_in, M_in},
|
||||
torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device()));
|
||||
|
||||
auto N_p2 = 1U << (32 - __builtin_clz(N_in - 1));
|
||||
auto axl_glbl = torch::empty(
|
||||
{N_p2 + N_p2 / 4, M_in + M_in / 4},
|
||||
torch::TensorOptions().dtype(torch::kFloat32).device(in_b.device()));
|
||||
axl_glbl.zero_(); // disable for FAST_UNSAFE_RDC_INIT
|
||||
|
||||
dim3 grid(CuCount);
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
// const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
#define WVSPLITKrc(_WvPrGrp, _YTILE, _UNRL, _N, _GrpsShrB) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
wvSplitKrc_<fptype, 64, _YTILE, _WvPrGrp, 8, _UNRL, _N, _GrpsShrB> \
|
||||
<<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, glbl, c, CuCount); \
|
||||
}
|
||||
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitKrc", [&] {
|
||||
using fptype = typename scalar<scalar_t>::type;
|
||||
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
|
||||
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
|
||||
const fptype* biasf4 =
|
||||
(in_bias.has_value() && in_bias->numel() > 0)
|
||||
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
||||
: nullptr;
|
||||
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||
auto glbl = axl_glbl.data_ptr<float>();
|
||||
switch (N_p2) {
|
||||
case 16:
|
||||
WVSPLITKrc(4, 16, 1, 16, 1) break;
|
||||
case 32:
|
||||
WVSPLITKrc(4, 16, 1, 32, 2) break;
|
||||
case 64:
|
||||
WVSPLITKrc(4, 16, 1, 64, 2) break;
|
||||
case 128:
|
||||
WVSPLITKrc(4, 16, 1, 128, 4) break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
"Unsupported N value: " + std::to_string(M_in) + "," +
|
||||
std::to_string(K_in) + "," + std::to_string(N_in));
|
||||
}
|
||||
});
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
@@ -1381,7 +1924,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
__shared__ fp8_t s[max_lds_len];
|
||||
|
||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -1570,7 +2113,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
__shared__ fp8_t s[max_lds_len];
|
||||
|
||||
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
|
||||
k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -26,6 +26,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
|
||||
"Tensor");
|
||||
rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);
|
||||
|
||||
// Custom gemm op for skinny matrix-matrix multiplication
|
||||
rocm_ops.def(
|
||||
"wvSplitKrc(Tensor in_a, Tensor in_b, Tensor? in_bias, int CuCount) -> "
|
||||
"Tensor");
|
||||
rocm_ops.impl("wvSplitKrc", torch::kCUDA, &wvSplitKrc);
|
||||
|
||||
// wvSplitK for fp8
|
||||
rocm_ops.def(
|
||||
"wvSplitKQ(Tensor in_a, Tensor in_b, Tensor? in_bias, Tensor! out_c, "
|
||||
|
||||
@@ -303,9 +303,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
|
||||
ops.impl("permute_cols", torch::kCUDA, &permute_cols);
|
||||
|
||||
// gptq_marlin Optimized Quantized GEMM for GPTQ.
|
||||
// Marlin Optimized Quantized GEMM (supports GPTQ, AWQ, FP8, NVFP4, MXFP4).
|
||||
ops.def(
|
||||
"gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
|
||||
"marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
|
||||
"Tensor? b_bias_or_none,Tensor b_scales, "
|
||||
"Tensor? a_scales, Tensor? global_scale, Tensor? b_zeros_or_none, "
|
||||
"Tensor? "
|
||||
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"()");
|
||||
ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
|
||||
|
||||
// A function that computes problem sizes for each expert's multiplication
|
||||
// used by the two mms called from fused MoE operation. It takes topk_ids as
|
||||
// an input, and computes problem_sizes1 and problem_sizes2 only.
|
||||
ops.def(
|
||||
"get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
|
||||
" Tensor! problem_sizes1, "
|
||||
" Tensor! problem_sizes2, "
|
||||
" int num_experts, int n, int k, "
|
||||
" Tensor? blockscale_offsets, "
|
||||
" bool? force_swap_ab) -> ()");
|
||||
ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
|
||||
&get_cutlass_moe_mm_problem_sizes);
|
||||
|
||||
// compute per-expert problem sizes from expert_first_token_offset
|
||||
// produced by vLLM's moe_permute kernel
|
||||
ops.def(
|
||||
@@ -559,7 +546,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// Compute NVFP4 block quantized tensor.
|
||||
ops.def(
|
||||
"scaled_fp4_quant(Tensor! output, Tensor input,"
|
||||
" Tensor! output_scale, Tensor input_scale) -> ()");
|
||||
" Tensor! output_scale, Tensor input_scale, bool "
|
||||
"is_sf_swizzled_layout) -> ()");
|
||||
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
|
||||
|
||||
// Compute NVFP4 experts quantization.
|
||||
@@ -705,7 +693,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
// Cache ops
|
||||
// Swap in (out) the cache blocks from src to dst.
|
||||
cache_ops.def(
|
||||
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
|
||||
"swap_blocks(Tensor src, Tensor! dst,"
|
||||
" int block_size_in_bytes, Tensor block_mapping) -> ()");
|
||||
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
|
||||
|
||||
// Reshape the key and value tensors and cache them.
|
||||
|
||||
@@ -5,6 +5,23 @@
|
||||
# docs/contributing/dockerfile/dockerfile.md and
|
||||
# docs/assets/contributing/dockerfile-stages-dependency.png
|
||||
|
||||
# =============================================================================
|
||||
# VERSION MANAGEMENT
|
||||
# =============================================================================
|
||||
# ARG defaults in this Dockerfile are the source of truth for pinned versions.
|
||||
# docker/versions.json is auto-generated for use with docker buildx bake.
|
||||
#
|
||||
# When updating versions:
|
||||
# 1. Edit the ARG defaults below
|
||||
# 2. Run: python tools/generate_versions_json.py
|
||||
#
|
||||
# To query versions programmatically:
|
||||
# jq -r '.variable.CUDA_VERSION.default' docker/versions.json
|
||||
#
|
||||
# To build with bake:
|
||||
# docker buildx bake -f docker/docker-bake.hcl -f docker/versions.json
|
||||
# =============================================================================
|
||||
|
||||
ARG CUDA_VERSION=12.9.1
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
@@ -117,8 +134,8 @@ ENV UV_LINK_MODE=copy
|
||||
# Verify GCC version
|
||||
RUN gcc --version
|
||||
|
||||
# Workaround for triton/pytorch issues
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
# Ensure CUDA compatibility library is loaded
|
||||
RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig
|
||||
|
||||
# ============================================================
|
||||
# SLOW-CHANGING DEPENDENCIES BELOW
|
||||
@@ -131,16 +148,41 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# install build and runtime dependencies
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install build and runtime dependencies, including PyTorch
|
||||
# Check whether to install torch nightly instead of release for this build
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cuda.txt requirements/cuda.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY pyproject.toml pyproject.toml
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing torch nightly..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 torch torchaudio torchvision --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& echo "Installing other requirements..." \
|
||||
&& /opt/venv/bin/python3 use_existing_torch.py --prefix \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
# Track PyTorch lib versions used during build and match in downstream instances.
|
||||
# We do this for both nightly and release so we can strip dependencies/*.txt as needed.
|
||||
# Otherwise library dependencies can upgrade/downgrade torch incorrectly.
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip freeze | grep -i "^torch=\|^torchvision=\|^torchaudio=" > torch_lib_versions.txt \
|
||||
&& TORCH_LIB_VERSIONS=$(cat torch_lib_versions.txt | xargs) \
|
||||
&& echo "Installed torch libs: ${TORCH_LIB_VERSIONS}"
|
||||
|
||||
# CUDA arch list used by torch
|
||||
# Explicitly set the list to avoid issues with torch 2.2
|
||||
# See https://github.com/pytorch/pytorch/pull/123243
|
||||
# From versions.json: .torch.cuda_arch_list
|
||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### BUILD BASE IMAGE ####################
|
||||
@@ -153,8 +195,13 @@ ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
# install build dependencies
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install build dependencies
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@@ -164,8 +211,18 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing build requirements without torch..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
&& echo "Installing torch nightly..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | grep -i "^torch=" | xargs) --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing build requirements..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
@@ -197,6 +254,13 @@ ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
|
||||
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
|
||||
|
||||
# Use existing torch for nightly builds
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
python3 use_existing_torch.py --prefix; \
|
||||
fi
|
||||
|
||||
# Build the vLLM wheel
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
@@ -240,6 +304,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
|
||||
#################### EXTENSIONS BUILD IMAGE ####################
|
||||
@@ -256,7 +321,8 @@ ENV UV_LINK_MODE=copy
|
||||
WORKDIR /workspace
|
||||
|
||||
# Build DeepGEMM wheel
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
# Default moved here from tools/install_deepgemm.sh for centralized version management
|
||||
ARG DEEPGEMM_GIT_REF=594953acce41793ae00a1233eb516044d604bcb6
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
mkdir -p /tmp/deepgemm/dist && \
|
||||
@@ -271,8 +337,9 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
|
||||
|
||||
# Build pplx-kernels and DeepEP wheels
|
||||
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
|
||||
ARG PPLX_COMMIT_HASH
|
||||
ARG DEEPEP_COMMIT_HASH
|
||||
# Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management
|
||||
ARG PPLX_COMMIT_HASH=12cecfd
|
||||
ARG DEEPEP_COMMIT_HASH=73b6ea4
|
||||
ARG NVSHMEM_VER
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
mkdir -p /tmp/ep_kernels_workspace/dist && \
|
||||
@@ -294,8 +361,13 @@ ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
# install build dependencies
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install build dependencies
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@@ -305,14 +377,23 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing build requirements without torch..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
&& echo "Installing torch nightly..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | grep -i "^torch=" | xargs) --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing build requirements..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# Copy pre-built csrc wheel directly
|
||||
COPY --from=csrc-build /workspace/dist /precompiled-wheels
|
||||
|
||||
COPY . .
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
@@ -325,6 +406,13 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
# Skip adding +precompiled suffix to version (preserves git-derived version)
|
||||
ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
|
||||
|
||||
# Use existing torch for nightly builds
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
python3 use_existing_torch.py --prefix; \
|
||||
fi
|
||||
|
||||
# Build the vLLM wheel
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "${vllm_target_device}" = "cuda" ]; then \
|
||||
@@ -347,7 +435,8 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||
else \
|
||||
echo "Skipping wheel size check."; \
|
||||
fi
|
||||
#################### EXTENSION Build IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
|
||||
#################### DEV IMAGE ####################
|
||||
FROM base AS dev
|
||||
@@ -365,12 +454,34 @@ ENV UV_LINK_MODE=copy
|
||||
|
||||
# Install libnuma-dev, required by fastsafetensors (fixes #20384)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install development dependencies
|
||||
COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.in requirements/test.in
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing dev requirements plus torch nightly..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& cat torch_lib_versions.txt >> requirements/test.in \
|
||||
&& uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | xargs) --pre \
|
||||
-r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing dev requirements..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
#################### DEV IMAGE ####################
|
||||
#################### vLLM installation IMAGE ####################
|
||||
# image with vLLM installed
|
||||
@@ -453,8 +564,8 @@ ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Workaround for triton/pytorch issues
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
# Ensure CUDA compatibility library is loaded
|
||||
RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig
|
||||
|
||||
# ============================================================
|
||||
# SLOW-CHANGING DEPENDENCIES BELOW
|
||||
@@ -474,7 +585,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||
# This is ~1.1GB and only changes when FlashInfer version bumps
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
ARG FLASHINFER_VERSION=0.5.3
|
||||
# From versions.json: .flashinfer.version
|
||||
ARG FLASHINFER_VERSION=0.6.1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
|
||||
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
|
||||
@@ -503,14 +615,20 @@ RUN set -eux; \
|
||||
|
||||
# Install vllm-openai dependencies (saves ~2.6s per build)
|
||||
# These are stable packages that don't depend on vLLM itself
|
||||
# From versions.json: .bitsandbytes.x86_64, .bitsandbytes.arm64
|
||||
# From versions.json: .openai_server_extras.timm, .openai_server_extras.runai_model_streamer
|
||||
ARG BITSANDBYTES_VERSION_X86=0.46.1
|
||||
ARG BITSANDBYTES_VERSION_ARM64=0.42.0
|
||||
ARG TIMM_VERSION=">=1.0.17"
|
||||
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.3"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
BITSANDBYTES_VERSION="0.42.0"; \
|
||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope \
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs]${RUNAI_MODEL_STREAMER_VERSION}"
|
||||
|
||||
# ============================================================
|
||||
# VLLM INSTALLATION (depends on build stage)
|
||||
@@ -521,11 +639,26 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install vLLM wheel first, so that torch etc will be installed.
|
||||
# Check whether to install torch nightly instead of release for this build.
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing torch nightly..." \
|
||||
&& uv pip install --system $(cat torch_lib_versions.txt | xargs) --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& echo "Installing vLLM..." \
|
||||
&& uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing vLLM..." \
|
||||
&& uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
@@ -585,12 +718,33 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y git
|
||||
|
||||
# install development dependencies (for testing)
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install development dependencies (for testing)
|
||||
COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.in requirements/test.in
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
if [ "$CUDA_MAJOR" -ge 12 ]; then \
|
||||
uv pip install --system -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing dev requirements plus torch nightly..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& cat torch_lib_versions.txt >> requirements/test.in \
|
||||
&& uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& uv pip install --system $(cat torch_lib_versions.txt | xargs) --pre \
|
||||
-r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing dev requirements..." \
|
||||
&& uv pip install --system -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi \
|
||||
fi
|
||||
|
||||
# install development dependencies (for testing)
|
||||
|
||||
@@ -15,9 +15,11 @@
|
||||
# Build arguments:
|
||||
# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
|
||||
# VLLM_CPU_DISABLE_AVX512=false (default)|true
|
||||
# VLLM_CPU_AVX512BF16=false (default)|true
|
||||
# VLLM_CPU_AVX512VNNI=false (default)|true
|
||||
# VLLM_CPU_AMXBF16=false |true (default)
|
||||
# VLLM_CPU_AVX2=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation)
|
||||
#
|
||||
|
||||
######################### COMMON BASE IMAGE #########################
|
||||
@@ -54,9 +56,12 @@ ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE="copy"
|
||||
|
||||
# Copy requirements files for installation
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cpu.txt requirements/cpu.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
|
||||
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
|
||||
uv pip install --upgrade pip && \
|
||||
uv pip install -r requirements/cpu.txt
|
||||
|
||||
@@ -88,6 +93,12 @@ ARG GIT_REPO_CHECK=0
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512=0
|
||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||
# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ...
|
||||
ARG VLLM_CPU_AVX2=0
|
||||
ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2}
|
||||
# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ...
|
||||
ARG VLLM_CPU_AVX512=0
|
||||
ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512}
|
||||
# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
|
||||
ARG VLLM_CPU_AVX512BF16=0
|
||||
ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
|
||||
@@ -100,18 +111,19 @@ ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# Copy build requirements
|
||||
COPY requirements/cpu-build.txt requirements/build.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
|
||||
uv pip install -r requirements/build.txt
|
||||
|
||||
COPY . .
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
######################### TEST DEPS #########################
|
||||
@@ -119,9 +131,11 @@ FROM base AS vllm-test-deps
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# Copy test requirements
|
||||
COPY requirements/test.in requirements/cpu-test.in
|
||||
|
||||
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
|
||||
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
cp requirements/test.in requirements/cpu-test.in && \
|
||||
RUN \
|
||||
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
||||
remove_packages_not_supported_on_aarch64() { \
|
||||
case "$(uname -m)" in \
|
||||
@@ -132,7 +146,7 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
esac; \
|
||||
}; \
|
||||
remove_packages_not_supported_on_aarch64 && \
|
||||
sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
|
||||
sed -i 's/^torch==.*/torch==2.10.0/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
|
||||
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
||||
@@ -200,4 +214,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
|
||||
uv pip install dist/*.whl
|
||||
|
||||
# Add labels to document build configuration
|
||||
LABEL org.opencontainers.image.title="vLLM CPU"
|
||||
LABEL org.opencontainers.image.description="vLLM inference engine for CPU platforms"
|
||||
LABEL org.opencontainers.image.vendor="vLLM Project"
|
||||
LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm"
|
||||
|
||||
# Build configuration labels
|
||||
ARG TARGETARCH
|
||||
ARG VLLM_CPU_DISABLE_AVX512
|
||||
ARG VLLM_CPU_AVX2
|
||||
ARG VLLM_CPU_AVX512
|
||||
ARG VLLM_CPU_AVX512BF16
|
||||
ARG VLLM_CPU_AVX512VNNI
|
||||
ARG VLLM_CPU_AMXBF16
|
||||
ARG PYTHON_VERSION
|
||||
|
||||
LABEL ai.vllm.build.target-arch="${TARGETARCH}"
|
||||
LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}"
|
||||
LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}"
|
||||
LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
#######
|
||||
#
|
||||
# THIS FILE IS DEPRECATED AND WILL BE REMOVED SHORTLY
|
||||
#
|
||||
# Please use the standard Dockerfile with PYTORCH_NIGHTLY=1 instead
|
||||
#
|
||||
#######
|
||||
|
||||
# The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing
|
||||
|
||||
# for torch nightly, cuda >=12.6 is required,
|
||||
@@ -213,15 +221,14 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
|
||||
|
||||
|
||||
# build flashinfer for torch nightly from source around 10 mins
|
||||
# release version: v0.5.2
|
||||
# release version: v0.6.1
|
||||
# todo(elainewy): cache flashinfer build result for faster build
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
echo "git clone flashinfer..." \
|
||||
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& git clone --depth 1 --branch v0.6.1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& cd flashinfer \
|
||||
&& git checkout v0.5.2 \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo "finish git clone flashinfer..." \
|
||||
&& rm -rf build \
|
||||
|
||||
@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
|
||||
FROM base AS fetch_vllm_1
|
||||
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
|
||||
ARG VLLM_BRANCH="main"
|
||||
ENV VLLM_REPO=${VLLM_REPO}
|
||||
ENV VLLM_BRANCH=${VLLM_BRANCH}
|
||||
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||
&& cd vllm \
|
||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||
&& pip uninstall -y vllm \
|
||||
&& uv pip install --system *.whl
|
||||
|
||||
# Install RIXL wheel
|
||||
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
|
||||
uv pip install --system /rixl_install/*.whl
|
||||
|
||||
WORKDIR /vllm-workspace
|
||||
ARG COMMON_WORKDIR
|
||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
|
||||
@@ -379,5 +385,5 @@ RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
#Set entrypoint for vllm-openai official images
|
||||
FROM final As vllm-openai
|
||||
FROM final AS vllm-openai
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@@ -198,92 +198,6 @@ RUN cd mori \
|
||||
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
|
||||
|
||||
|
||||
###
|
||||
### RIXL Build
|
||||
###
|
||||
FROM build_pytorch AS build_rixl
|
||||
ARG RIXL_BRANCH
|
||||
ARG RIXL_REPO
|
||||
ARG ETCD_BRANCH
|
||||
ARG ETCD_REPO
|
||||
ARG UCX_BRANCH
|
||||
ARG UCX_REPO
|
||||
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV UCX_HOME=/usr/local/ucx
|
||||
ENV RIXL_HOME=/usr/local/rixl
|
||||
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
|
||||
|
||||
# RIXL build system dependences and RDMA support
|
||||
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
|
||||
libgrpc-dev \
|
||||
libgrpc++-dev \
|
||||
libprotobuf-dev \
|
||||
protobuf-compiler-grpc \
|
||||
libcpprest-dev \
|
||||
libaio-dev \
|
||||
librdmacm1 \
|
||||
librdmacm-dev \
|
||||
libibverbs1 \
|
||||
libibverbs-dev \
|
||||
ibverbs-utils \
|
||||
rdmacm-utils \
|
||||
ibverbs-providers
|
||||
|
||||
RUN pip install meson auditwheel patchelf tomlkit
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone ${ETCD_REPO} && \
|
||||
cd etcd-cpp-apiv3 && \
|
||||
git checkout ${ETCD_BRANCH} && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
|
||||
make -j$(nproc) && \
|
||||
make install
|
||||
|
||||
RUN cd /usr/local/src && \
|
||||
git clone ${UCX_REPO} && \
|
||||
cd ucx && \
|
||||
git checkout ${UCX_BRANCH} && \
|
||||
./autogen.sh && \
|
||||
mkdir build && cd build && \
|
||||
../configure \
|
||||
--prefix=/usr/local/ucx \
|
||||
--enable-shared \
|
||||
--disable-static \
|
||||
--disable-doxygen-doc \
|
||||
--enable-optimizations \
|
||||
--enable-devel-headers \
|
||||
--with-rocm=/opt/rocm \
|
||||
--with-verbs \
|
||||
--with-dm \
|
||||
--enable-mt && \
|
||||
make -j && \
|
||||
make -j install
|
||||
|
||||
ENV PATH=/usr/local/ucx/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
|
||||
|
||||
RUN git clone ${RIXL_REPO} /opt/rixl && \
|
||||
cd /opt/rixl && \
|
||||
git checkout ${RIXL_BRANCH} && \
|
||||
meson setup build --prefix=${RIXL_HOME} \
|
||||
-Ducx_path=${UCX_HOME} \
|
||||
-Drocm_path=${ROCM_PATH} && \
|
||||
cd build && \
|
||||
ninja && \
|
||||
ninja install
|
||||
|
||||
# Generate RIXL wheel
|
||||
RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||
./contrib/build-wheel.sh \
|
||||
--output-dir /app/install \
|
||||
--rocm-dir ${ROCM_PATH} \
|
||||
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
||||
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
||||
|
||||
|
||||
###
|
||||
### FlashAttention Build
|
||||
###
|
||||
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
|
||||
@@ -385,12 +297,6 @@ ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
ARG RIXL_BRANCH
|
||||
ARG RIXL_REPO
|
||||
ARG ETCD_BRANCH
|
||||
ARG ETCD_REPO
|
||||
ARG UCX_BRANCH
|
||||
ARG UCX_REPO
|
||||
ARG MORI_BRANCH
|
||||
ARG MORI_REPO
|
||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
|
||||
&& echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
|
||||
&& echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
|
||||
&& echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
|
||||
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user