Compare commits
503 Commits
v0.16.0rc2...v0.16.1rc0
Commits (SHA1):

3827c8c55a ade81f17fe 6042e66cd5 9f9a675b23 a07c4c5939 d3a51da92a 186ea22efe 4a9c07a0a2 9d37941017 4171ff6dd9
13025e71e8 71dfce6aa6 2aa4140402 86c3b5a808 160424a937 9511a3f8ee de527e1cec 1976356ee6 cbf8f7028c 6831650c40
ed42507f6d 9571e99945 c97234c08b b188bab441 15d76f74e2 8fd6975479 5d18bf8b32 0788ff0a15 d72b0be33c 42489e43c2
af5e6afa0a ee59a7c615 709eadbb0b 90fc7f9109 675ec59aa9 80e60a6133 26e722f906 2c619e5e3f 8a685be8d9 2465071510
cd43673668 35d44b4557 8ad54a991b 92510edc32 a6c137521c 4572a06afe 5cc29cfb8b 8fae54faff f7967577f5 af770b8e7b
2ff3e436ad c2c4c4611a f38f8c9742 ec1d30c0f6 e3b2324ec4 dbf0da817a 3bbb2046ff 576fe50333 a0e50a4260 9fa5b25a23
ea97750414 067c5d9ad1 f5972a872f a9e15e040d 542ca66357 fc8456c336 9ce8fad2a9 c38b8d5a31 60da0e1544 9609b1f18d
a0c7081695 34ce0ffd1f 0de5333989 a87cc50859 761e63e541 d12d201409 b3ad37c5db 14561fabfd c77f3e1207 012dee9233
f1c664545b c870eb9e0f 6af03f2394 1a6cf39dec f91808ae0d 33a0d43c71 80d93fd6da ec85340531 2ff4e51152 95642441d0
a7c9f7b7ec a4bd661fb3 3ef9fd0f98 22a97e6613 596ed1f02e b8d8b7e934 28c5e69ba0 864167d376 a2ba6a5244 c4f38696f7
a7f341c323 d13ece38d7 5cc7c4452e b95bb6927f 392645454b 1e8438a89a 8435b2e049 b1b5e045df 5f68464f92 aa08a30fc9
7f40e9e516 103e614b14 54e2f83d0a e631f8e78e e97c46a92d 7291d1b288 987506bca6 c645e9a214 944ffb5968 2bcf71b9c0
b7892a3bef 682566b18e b9c2a565cc dd8c3a7fb2 a8a47c17b6 40f88d8318 2cbf9656ce 30132cd144 cbd95a2dd1 970861ac0c
d24bdd7c4b d403c1da1c b71fbd06e2 74d90b1ce4 a4047d4ea9 965fe45935 98b0205c3c 272b535ab3 f74f1572ca bebfe55b1c
820d7815eb ab6f3487a6 8dc8a99b56 2aab2bb543 54254f7a61 cf93c1a128 89358f0d35 a0fe7ea2f0 991d6bff38 5719a4e4e6
11be2c74dc 7a5adad480 59c6233297 d38cd3dde5 ded333fb9b 9d7577b2bd e739c29ea4 a55caf6ae9 0e22cd618b ea5f903f80
0632ed8778 aaefc58ee0 f24b2de3d3 fac1507f03 f863994084 e4a5d8c653 a6d0299c75 6ce80f7071 1fe462168c ed31a020ee
f9ac19204f 59965affbd b1c4f0b265 8de7c636cc 059779231f ea37530b47 f5432e35a3 07cab212f0 0c1dc42748 676f82ae81
81bfc21a6a 4e2c7caf2d d9e62c03eb a1a2d79442 ac900c89bb 76df6072ff 16f24e8797 40b2f1c3d9 648951a9c3 f72061a19a
662205d34e 4fb8beefaa 304319c4ed c683d11c94 3eff45d793 4685a630a2 ee1d25f199 6fff24f30f 23210a911e 1391378861
f6220f9877 2df2bb27b0 f75b61a9e9 7f51e93864 4611af1663 ad5aa6bd9f 9681068cf9 b6101d384d 5fcb0cdd68 c878b43b64
2b84ac669c 11d3976b88 40da9625a1 8d9babd4de e99ba957ec 64ac1395e8 61cf087680 847a57cd12 fcd6ac97ed 95be2a7f22
0e60c925cf d7ff22204a c0bd8b13da caeb887bf6 6b3166a7c7 25e2e136ef 6874638bc4 e24663c5a9 c50e105a88 a766b30349
1faa8cb73c e89a91d927 909b147197 a88b3be7c4 a49ea5a58f 30ebe0dc3c cef65f0715 6f3b2047ab 02e8f26cea 4a00a511bb
a0d8d944e2 df3f537a66 7743152957 ab33d2a629 be3af2d29e c656ba3b4d dc5fa77a4e 1e4a084c8e 7967e854da 6bd6d0c3c1
8e962fef5f 574fe75245 c61a98f529 28bffe9466 ad65177a19 d44a5b6c47 1d65283e95 c464b57374 c5c38e152a d00df624f3
9752da9d9c 04925b2202 d74278fb67 b68fd899d1 0b5f9b7204 9a8853f781 387a1898d9 3b30e61507 824f9e8f3c 6cc403e67d
72d5951d02 a3205beffb 6930becd45 03a8770a6d bc56a1d56e ec7d9e6745 3bb4e4311c 08f8c198ae a21cedf4ff 3ef74cde5d
cd81cdb399 1e828573b4 a5ccc85c8c b5475d0534 9521002f0a ec17bdd894 bb59c90248 5bff999d12 bb85929aa6 5653021094
974d829b05 91ac5d9bfd 23d825aba1 f07a128413 71cd89264f 19fab44152 79c7e09235 79f3fab05a 604b9eaec5 50dbd6c9e6
98bcc6ca59 f13e86d8dd 9ca768c740 d5fe3f702c 73391a1baa b3c14229b0 2f186635cb 342a7cda2d d1ea65d0a1 de42abb366
60ca7981bc 0ef5b9147b ed242652d7 b37b679770 a0638d052d c027541eaf fd267bc7b7 bfaa559305 87789c8364 bcd65c1f6a
59d53066d8 4a9952ec1b 1dae7b7843 5885e330ef 071d863e20 0916e7960b 3d2a026fd0 dddbff4624 47e9b63e1a 934acddef9
742d214d6e 4137c5dfa7 7a8a46ddcb bcf0731aa0 ec090c2429 eea3024f43 2f308214c0 1b4e8e53f8 dcf6ee8592 372b2e762a
6afa587d31 94ed6cf6ea bf37812ca7 b86bf4417e de13dd781f 62788f99a4 ea5ff3a1f6 04ea31baab 6f019e6e0a d707678dfb
fc22cae4ac 96161fe978 4453ba8d9e aa181c923b be7370daf3 9ea1f598ce f120bd42d3 fac4e96940 6d4e27ce29 4c078fa546
6c0baee610 1100a97621 766e167821 becbe24808 679ca5d8d3 f2c47886fd 334c715e0f 7b5a8b4a9d dea63512bb 8a798be929
fb455ed547 f5897613fb 55a1a9563a 386bfe5d08 e9cd691132 80f2ba6ea6 136b0bfa59 b96f7314b4 ced2a92f40 e1d97c38f8
ec12d39d44 ff1f83b056 83b47f67b1 fb7b30c716 31d992d215 5aff2699bd 527ca32197 5458eb835d 144d9b7cc8 83e26c834e
5001211369 11c7ace340 be7f3d5d20 0ab06100f4 ffb3d553cc fa7e0bfacf 48134a2c22 64f570ab56 fd618871b4 67a42b5a44
c7914d30f9 1b8756562e 275e0d2a99 0f5e55e7a8 1e9204bff3 05339a7b20 40b8f55358 5045d5c983 e09546cf05 786806dd44
79504027ef addac0e653 675a22ed66 cb9574eb85 21dfb842d7 d1b837f0ae 0b20469c62 d7982daff5 9b17c57460 1b3540e6c6
7a048ee65f c9a1923bb4 b482f71e9f 1485396abb 5ee5c86eeb b5dcb372e4 066c6da6a0 e30cedd44b 3bcd494ef4 0e725a7d22
ba0511fd80 4a1550d22d d1481ba783 dc6de33c3d c4b9e6778f 341eed3d30 6f2f59f2b3 bb2fc8b5e7 67132945bb f0ca0671c7
578977bb5e 9615575afc 4293c00b84 506ad7d7c1 fdd6f2ad58 33bcd3dc3b 1f5febb4b8 ae871ca923 a2443de5fa f84a2a8f31
000214c4bb c5a66d1697 afdce12c89 82e11973cc b129136c7a 599e4335a4 a1946570d8 d0bc520569 748625cdaf 61413973e8
94de871546 e042d7e685 ae4e280602 cbea11c9f0 2c32558a3c 5f970120f0 998e2d91f8 e1060a71a1 97fa8f6590 dab1de9f38
8d48d0a9d9 9608844f96 f69b903b4c 81e217fe6b ab97bcf662 25e48a3aae 8a5e0e2b2b 4cde2e0159 047a457fa4 e94ec59733
13397841ab c60f8e3b49 5e75a14a66 e7e52781ff bb9f97308d 4d39650961 8fd31f6245 eadb4e868b 285bab4752 995bbf38f1
d4f123cc48 cb62e86f83 781ddf7868 64a9c2528b d0d97e2974 9562912cea 9bdb06b436 caad9f1e01 1d5922fade 3025b3cebb
978a37c823 5a5c43511a d9bede0314
@@ -1,6 +1,7 @@
group: Hardware
group: Hardware - AMD Build
steps:
- label: "AMD: :docker: build image"
key: image-build-amd
depends_on: []
device: amd_cpu
no_plugin: true
@@ -9,7 +10,7 @@ steps:
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm

@@ -8,7 +8,7 @@ clean_docker_tag() {
}

print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
exit 1
}

@@ -142,11 +142,16 @@ resolve_parent_commit() {

print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
}

#################################
@@ -154,7 +159,7 @@ print_bake_config() {
#################################
print_instance_info

if [[ $# -lt 7 ]]; then
if [[ $# -lt 5 ]]; then
print_usage_and_exit
fi

@@ -163,10 +168,8 @@ REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
IMAGE_TAG=$5
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional

# build config
TARGET="test-ci"
@@ -193,8 +196,6 @@ export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT

# print args
echo "--- :mag: Arguments"
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"

@@ -5,8 +5,7 @@ steps:
depends_on: []
timeout_in_minutes: 600
commands:
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
retry:
automatic:
- exit_status: -1 # Agent was lost
@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,13 +24,13 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--target vllm-test \
--progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,10 +24,10 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--target vllm-test \
--progress plain .

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu

@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -25,10 +25,10 @@ fi
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git

# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"

usage() {
echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT
--limit "$LIMIT"

@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"

usage() {
echo``

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"

usage() {
echo``

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.9.2"
# pip install "lm-eval[api]>=0.4.11"

usage() {
echo``
@@ -20,14 +20,11 @@ usage() {
echo
}

while getopts "m:b:l:f:t:" OPT; do
while getopts "m:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
@@ -9,8 +9,10 @@ import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from importlib import util
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import regex as re
|
||||
|
||||
pd.options.display.float_format = "{:.2f}".format
|
||||
plotly_found = util.find_spec("plotly.express") is not None
|
||||
@@ -275,6 +277,131 @@ def _apply_two_decimals(
|
||||
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Export helpers (Excel + CSV)
|
||||
# -----------------------------
|
||||
def _sanitize_sheet_name(name: str) -> str:
|
||||
"""
|
||||
Excel sheet constraints:
|
||||
- max 31 chars
|
||||
- cannot contain: : \ / ? * [ ]
|
||||
- cannot be empty
|
||||
"""
|
||||
name = "sheet" if name is None else str(name)
|
||||
name = re.sub(r"[:\\/?*\[\]]", "_", name)
|
||||
name = name.strip().strip("'")
|
||||
name = re.sub(r"\s+", " ", name)
|
||||
if not name:
|
||||
name = "sheet"
|
||||
return name[:31]
|
||||
|
||||
|
||||
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
|
||||
d = dict(zip(group_cols, gkey_tuple))
|
||||
model = d.get("Model", "model")
|
||||
model_short = str(model).split("/")[-1]
|
||||
ilen = d.get("Input Len", "")
|
||||
olen = d.get("Output Len", "")
|
||||
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
|
||||
return _sanitize_sheet_name(f"{model_short}{lens}")
|
||||
|
||||
|
||||
def _write_tables_to_excel_sheet(
|
||||
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
|
||||
):
|
||||
startrow = 0
|
||||
for title, df in blocks:
|
||||
pd.DataFrame([[title]]).to_excel(
|
||||
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
|
||||
)
|
||||
startrow += 1
|
||||
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
|
||||
startrow += len(df) + 3
|
||||
|
||||
|
||||
def _safe_filename(s: str) -> str:
|
||||
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
|
||||
return s[:180] if len(s) > 180 else s
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# vLLM environment export helper
|
||||
# -----------------------------
|
||||
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
|
||||
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
|
||||
|
||||
Supports:
|
||||
- section headers as standalone lines (no ':' or '=')
|
||||
- key-value lines like 'OS: Ubuntu ...'
|
||||
- env var lines like 'HF_HOME=/data/hf'
|
||||
"""
|
||||
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
|
||||
section = "General"
|
||||
rows: list[dict] = []
|
||||
|
||||
def set_section(s: str):
|
||||
nonlocal section
|
||||
s = (s or "").strip()
|
||||
if s:
|
||||
section = s
|
||||
|
||||
for raw in lines:
|
||||
stripped = raw.strip()
|
||||
if not stripped:
|
||||
continue
|
||||
# divider lines like =====
|
||||
if set(stripped) <= {"="}:
|
||||
continue
|
||||
|
||||
# section header heuristic: short standalone line
|
||||
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
|
||||
if stripped.lower().startswith("collecting environment information"):
|
||||
continue
|
||||
set_section(stripped)
|
||||
continue
|
||||
|
||||
# env var style: KEY=VALUE (and not a URL with :)
|
||||
if "=" in stripped and ":" not in stripped:
|
||||
k, v = stripped.split("=", 1)
|
||||
k = k.strip()
|
||||
v = v.strip()
|
||||
if k:
|
||||
rows.append({"Section": section, "Key": k, "Value": v})
|
||||
continue
|
||||
|
||||
# key: value
|
||||
if ":" in stripped:
|
||||
k, v = stripped.split(":", 1)
|
||||
k = k.strip()
|
||||
v = v.strip()
|
||||
if k:
|
||||
rows.append({"Section": section, "Key": k, "Value": v})
|
||||
continue
|
||||
|
||||
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
|
||||
|
||||
|
||||
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
|
||||
"""Load vllm_env.txt next to the *original* input JSON file.
|
||||
|
||||
Note: when only one -f is provided, the script may split JSON into ./splits/...,
|
||||
but vllm_env.txt typically lives next to the original benchmark_results.json.
|
||||
"""
|
||||
base_dir: Path | None = None
|
||||
if getattr(args, "file", None):
|
||||
base_dir = Path(args.file[0]).resolve().parent
|
||||
elif files:
|
||||
base_dir = Path(files[0]).resolve().parent
|
||||
if base_dir is None:
|
||||
return None
|
||||
|
||||
env_path = base_dir / "vllm_env.txt"
|
||||
if not env_path.exists():
|
||||
return None
|
||||
df = _parse_vllm_env_txt(env_path)
|
||||
return df
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Valid max concurrency summary helpers
|
||||
# -----------------------------
|
||||
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(
|
||||
|
||||
summary_df = pd.DataFrame(rows)
|
||||
|
||||
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
|
||||
for c in summary_df.columns:
|
||||
if c == "Configuration":
|
||||
continue
|
||||
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(
|
||||
|
||||
both_col = f"Max {conc_col} (Both)"
|
||||
|
||||
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
|
||||
formatters = {}
|
||||
for c in summary_df.columns:
|
||||
if c == "Configuration":
|
||||
continue
|
||||
# default argument binds per-column formatter correctly
|
||||
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
|
||||
|
||||
styler = summary_df.style.format(formatters)
|
||||
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
|
||||
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
||||
|
||||
|
||||
def build_valid_max_concurrency_summary_df(
|
||||
tput_group_df: pd.DataFrame | None,
|
||||
ttft_group_df: pd.DataFrame | None,
|
||||
tpot_group_df: pd.DataFrame | None,
|
||||
conc_col: str,
|
||||
args,
|
||||
) -> pd.DataFrame | None:
|
||||
if ttft_group_df is None and tpot_group_df is None:
|
||||
return None
|
||||
|
||||
ttft_cols = (
|
||||
_config_value_columns(ttft_group_df, conc_col)
|
||||
if ttft_group_df is not None
|
||||
else []
|
||||
)
|
||||
tpot_cols = (
|
||||
_config_value_columns(tpot_group_df, conc_col)
|
||||
if tpot_group_df is not None
|
||||
else []
|
||||
)
|
||||
tput_cols = (
|
||||
_config_value_columns(tput_group_df, conc_col)
|
||||
if tput_group_df is not None
|
||||
else []
|
||||
)
|
||||
|
||||
if ttft_group_df is not None and tpot_group_df is not None:
|
||||
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
|
||||
if tput_group_df is not None:
|
||||
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
|
||||
else:
|
||||
cfg_cols = ttft_cols or tpot_cols
|
||||
|
||||
if not cfg_cols:
|
||||
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||
|
||||
rows = []
|
||||
for cfg in cfg_cols:
|
||||
ttft_max = (
|
||||
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
||||
if ttft_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
tpot_max = (
|
||||
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
||||
if tpot_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
both = (
|
||||
pd.NA
|
||||
if (pd.isna(ttft_max) or pd.isna(tpot_max))
|
||||
else min(ttft_max, tpot_max)
|
||||
)
|
||||
|
||||
tput_at_both = (
|
||||
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
|
||||
if tput_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
ttft_at_both = (
|
||||
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
|
||||
if ttft_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
tpot_at_both = (
|
||||
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
|
||||
if tpot_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
|
||||
rows.append(
|
||||
{
|
||||
"Configuration": cfg,
|
||||
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
||||
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
||||
f"Max {conc_col} (Both)": both,
|
||||
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||
"TTFT @ Both (ms)": ttft_at_both,
|
||||
"TPOT @ Both (ms)": tpot_at_both,
|
||||
}
|
||||
)
|
||||
|
||||
df = pd.DataFrame(rows)
|
||||
for c in df.columns:
|
||||
if c != "Configuration":
|
||||
df[c] = pd.to_numeric(df[c], errors="coerce")
|
||||
return df
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Plot helper
|
||||
# -----------------------------
|
||||
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
default=100.0,
|
||||
help="Reference limit for TPOT plots (ms)",
|
||||
)
|
||||
|
||||
# ---- NEW: export options ----
|
||||
parser.add_argument(
|
||||
"--excel-out",
|
||||
type=str,
|
||||
default="perf_comparison.xlsx",
|
||||
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--csv-out-dir",
|
||||
type=str,
|
||||
default="",
|
||||
help="If set, write per-group per-metric CSVs into this directory.",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
@@ -657,7 +885,6 @@ def maybe_write_plot(
|
||||
markers=True,
|
||||
)
|
||||
|
||||
# Ensure plot hover + y tick labels are also 2 decimals.
|
||||
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
|
||||
fig.update_yaxes(tickformat=".2f")
|
||||
|
||||
@@ -730,87 +957,151 @@ def write_report_group_first(
|
||||
for metric_label, (df, _) in metric_cache.items()
|
||||
}
|
||||
|
||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
||||
main_fh.write('<meta charset="utf-8">\n')
|
||||
for gkey in group_keys:
|
||||
gkey_tuple = normalize_group_key(gkey)
|
||||
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
||||
sub_path = group_filename(gkey_tuple)
|
||||
group_header = (
|
||||
'<div style="font-size: 1.4em; font-weight: 700; '
|
||||
'margin: 18px 0 10px 0;">'
|
||||
f"{_html.escape(suffix)}"
|
||||
"</div>\n"
|
||||
)
|
||||
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
|
||||
if csv_dir:
|
||||
csv_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
main_fh.write(group_header)
|
||||
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
||||
sub_fh.write('<meta charset="utf-8">\n')
|
||||
sub_fh.write(group_header)
|
||||
tput_group_df = None
|
||||
ttft_group_df = None
|
||||
tpot_group_df = None
|
||||
conc_col = args.xaxis
|
||||
excel_path = args.excel_out or "perf_comparison.xlsx"
|
||||
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
|
||||
# ---- Environment sheet (first) ----
|
||||
env_sheet = _sanitize_sheet_name("Environment")
|
||||
env_df = _load_env_df_for_inputs(args, files)
|
||||
if env_df is None or env_df.empty:
|
||||
pd.DataFrame(
|
||||
[
|
||||
{
|
||||
"Section": "Environment",
|
||||
"Key": "vllm_env.txt",
|
||||
"Value": "NOT FOUND (or empty)",
|
||||
}
|
||||
]
|
||||
).to_excel(xw, sheet_name=env_sheet, index=False)
|
||||
else:
|
||||
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
|
||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
||||
main_fh.write('<meta charset="utf-8">\n')
|
||||
for gkey in group_keys:
|
||||
gkey_tuple = normalize_group_key(gkey)
|
||||
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
||||
sub_path = group_filename(gkey_tuple)
|
||||
group_header = (
|
||||
'<div style="font-size: 1.4em; font-weight: 700; '
|
||||
'margin: 18px 0 10px 0;">'
|
||||
f"{_html.escape(suffix)}"
|
||||
"</div>\n"
|
||||
)
|
||||
|
||||
for metric_label in plan.data_cols:
|
||||
gb = metric_groupbys[metric_label]
|
||||
df_sorted, raw_data_cols = metric_cache[metric_label]
|
||||
main_fh.write(group_header)
|
||||
|
||||
try:
|
||||
group_df = gb.get_group(gkey)
|
||||
except KeyError:
|
||||
missing = (
|
||||
'<div style="font-size: 1.1em; font-weight: 600; '
|
||||
'margin: 10px 0;">'
|
||||
f"{_html.escape(metric_label)} — missing for this group"
|
||||
"</div>\n"
|
||||
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
|
||||
sheet_base = sheet
|
||||
dedup_i = 1
|
||||
while sheet in xw.sheets:
|
||||
dedup_i += 1
|
||||
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
|
||||
|
||||
excel_blocks: list[tuple[str, pd.DataFrame]] = []
|
||||
|
||||
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
||||
sub_fh.write('<meta charset="utf-8">\n')
|
||||
sub_fh.write(group_header)
|
||||
tput_group_df = None
|
||||
ttft_group_df = None
|
||||
tpot_group_df = None
|
||||
conc_col = args.xaxis
|
||||
|
||||
for metric_label in plan.data_cols:
|
||||
gb = metric_groupbys[metric_label]
|
||||
df_sorted, raw_data_cols = metric_cache[metric_label]
|
||||
|
||||
try:
|
||||
group_df = gb.get_group(gkey)
|
||||
except KeyError:
|
||||
missing = (
|
||||
'<div style="font-size: 1.1em; font-weight: 600; '
|
||||
'margin: 10px 0;">'
|
||||
f"{_html.escape(metric_label)} — missing for this group"
|
||||
"</div>\n"
|
||||
)
|
||||
main_fh.write(missing)
|
||||
sub_fh.write(missing)
|
||||
continue
|
||||
|
||||
if conc_col not in group_df.columns:
|
||||
conc_col = _find_concurrency_col(group_df)
|
||||
|
||||
mn = metric_label.lower().strip()
|
||||
if "tok/s" in mn:
|
||||
tput_group_df = group_df
|
||||
elif "ttft" in mn:
|
||||
ttft_group_df = group_df
|
||||
elif mn in ("p99", "median") or "tpot" in mn:
|
||||
tpot_group_df = group_df
|
||||
|
||||
display_group = group_df.drop(
|
||||
columns=group_cols_canonical, errors="ignore"
|
||||
)
|
||||
|
||||
main_fh.write(missing)
|
||||
sub_fh.write(missing)
|
||||
continue
|
||||
html = render_metric_table_html(
|
||||
display_group, metric_label, suffix, args
|
||||
)
|
||||
main_fh.write(html)
|
||||
sub_fh.write(html)
|
||||
|
||||
if conc_col not in group_df.columns:
|
||||
conc_col = _find_concurrency_col(group_df)
|
||||
maybe_write_plot(
|
||||
main_fh,
|
||||
sub_fh,
|
||||
group_df=group_df,
|
||||
raw_data_cols=raw_data_cols,
|
||||
metric_label=metric_label,
|
||||
y_axis_col=y_axis_col,
|
||||
args=args,
|
||||
)
|
||||
|
||||
mn = metric_label.lower().strip()
|
||||
if "tok/s" in mn:
|
||||
tput_group_df = group_df
|
||||
elif "ttft" in mn:
|
||||
ttft_group_df = group_df
|
||||
elif mn in ("p99", "median") or "tpot" in mn:
|
||||
tpot_group_df = group_df
|
||||
excel_blocks.append(
|
||||
(metric_label, display_group.reset_index(drop=True))
|
||||
)
|
||||
if csv_dir:
|
||||
fn = _safe_filename(
|
||||
f"{sheet}__{metric_label}".replace(" ", "_").replace(
|
||||
"/", "_"
|
||||
)
|
||||
)
|
||||
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||
|
||||
display_group = group_df.drop(
|
||||
columns=group_cols_canonical, errors="ignore"
|
||||
)
|
||||
|
||||
html = render_metric_table_html(
|
||||
display_group, metric_label, suffix, args
|
||||
)
|
||||
main_fh.write(html)
|
||||
sub_fh.write(html)
|
||||
|
||||
maybe_write_plot(
|
||||
main_fh,
|
||||
sub_fh,
|
||||
group_df=group_df,
|
||||
raw_data_cols=raw_data_cols,
|
||||
metric_label=metric_label,
|
||||
y_axis_col=y_axis_col,
|
||||
summary_html = build_valid_max_concurrency_summary_html(
|
||||
tput_group_df=tput_group_df,
|
||||
ttft_group_df=ttft_group_df,
|
||||
tpot_group_df=tpot_group_df,
|
||||
conc_col=conc_col,
|
||||
args=args,
|
||||
)
|
||||
if summary_html:
|
||||
main_fh.write(summary_html)
|
||||
sub_fh.write(summary_html)
|
||||
|
||||
summary_html = build_valid_max_concurrency_summary_html(
|
||||
tput_group_df=tput_group_df,
|
||||
ttft_group_df=ttft_group_df,
|
||||
tpot_group_df=tpot_group_df,
|
||||
conc_col=conc_col,
|
||||
args=args,
|
||||
)
|
||||
if summary_html:
|
||||
main_fh.write(summary_html)
|
||||
sub_fh.write(summary_html)
|
||||
summary_df = build_valid_max_concurrency_summary_df(
|
||||
tput_group_df=tput_group_df,
|
||||
ttft_group_df=ttft_group_df,
|
||||
tpot_group_df=tpot_group_df,
|
||||
conc_col=conc_col,
|
||||
args=args,
|
||||
)
|
||||
if summary_df is not None:
|
||||
excel_blocks.append(
|
||||
("Valid Max Concurrency Summary", summary_df)
|
||||
)
|
||||
if csv_dir:
|
||||
fn = _safe_filename(
|
||||
f"{sheet}__Valid_Max_Concurrency_Summary"
|
||||
)
|
||||
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||
|
||||
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
|
||||
|
||||
print(f"Wrote Excel: {excel_path}")
|
||||
if csv_dir:
|
||||
print(f"Wrote CSVs under: {csv_dir}")
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
@@ -1,6 +1,4 @@
#!/bin/bash

# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/

@@ -9,14 +7,19 @@
set -x
set -o pipefail

# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"

check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
fi

if [[ $gpu_count -gt 0 ]]; then
@@ -44,7 +47,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo $numa_count
echo "$numa_count"
else
echo "Need at least 1 NUMA to run benchmarking."
exit 1
@@ -112,13 +115,12 @@ json2envs() {
}

wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -X POST localhost:8000/v1/completions; do
until curl -sf http://localhost:8000/v1/models >/dev/null; do
sleep 1
done' && return 0 || return 1
done
'
}

kill_processes_launched_by_current_bash() {
@@ -252,37 +254,16 @@ run_benchmark_tests() {
done
}

run_latency_tests() {
run_benchmark_tests "latency" "$1"
}
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }

run_startup_tests() {
run_benchmark_tests "startup" "$1"
}

run_throughput_tests() {
run_benchmark_tests "throughput" "$1"
}

run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }

local serving_test_file
serving_test_file=$1

# Iterate over serving tests
jq -c '
merge_serving_tests_stream() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
# shellcheck disable=SC2016
local merged='
if type == "array" then
# Plain format: test cases array
.[]
@@ -304,7 +285,50 @@ run_serving_tests() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
' "$serving_test_file" | while read -r params; do
'

jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}

run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }

local serving_test_file
serving_test_file=$1

# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi

# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -373,7 +397,7 @@ run_serving_tests() {
echo "Server command: $server_command"
# support remote vllm server
client_remote_args=""
if [[ -z "${REMOTE_HOST}" ]]; then
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -384,6 +408,9 @@ run_serving_tests() {
echo ""
echo "vLLM failed to start within the timeout period."
fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
@@ -402,14 +429,12 @@ run_serving_tests() {
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi

# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name"
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
@@ -425,7 +450,9 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"

bash -c "$client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi

# record the benchmarking commands
jq_output=$(jq -n \
@@ -443,12 +470,15 @@ run_serving_tests() {
done

# clean up
kill -9 $server_pid
kill_gpu_processes
if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill -9 "$server_pid"
kill_gpu_processes
fi
done
}

main() {

local ARCH
ARCH=''
if [[ "$ON_CPU" == "1" ]]; then
@@ -458,7 +488,13 @@ main() {
check_gpus
ARCH="$arch_suffix"
fi
check_hf_token

# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi

# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -479,11 +515,16 @@ main() {

# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)

echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"

# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?

if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi

run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
@@ -0,0 +1,41 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}
@@ -0,0 +1,283 @@
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -148,136 +148,6 @@
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
|
||||
|
||||
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
|
||||
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
|
||||
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
|
||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||
## ROCm Wheel and Docker Image Releases
|
||||
|
||||
@@ -83,7 +83,7 @@ case "${1:-}" in
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||
exit 1
|
||||
@@ -110,9 +110,9 @@ case "${1:-}" in
|
||||
|
||||
echo ""
|
||||
echo "Downloaded wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||
echo ""
|
||||
echo "Total: $WHEEL_COUNT wheels"
|
||||
echo "========================================"
|
||||
|
||||
@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
||||
|
||||
# Store PR data in a temp file
|
||||
PR_DATA=$(mktemp)
|
||||
trap "rm -f $PR_DATA" EXIT
|
||||
trap 'rm -f "$PR_DATA"' EXIT
|
||||
|
||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||
--limit 1000 \
|
||||
|
||||
@@ -1,25 +1,37 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
# This script runs tests inside the corresponding ROCm docker container.
|
||||
# It handles both single-node and multi-node test configurations.
|
||||
#
|
||||
# Multi-node detection: Instead of matching on fragile group names, we detect
|
||||
# multi-node jobs structurally by looking for the bracket command syntax
|
||||
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
|
||||
set -o pipefail
|
||||
|
||||
# Export Python path
|
||||
export PYTHONPATH=".."
|
||||
|
||||
# Print ROCm version
|
||||
echo "--- Confirming Clean Initial State"
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
###############################################################################
|
||||
# Helper Functions
|
||||
###############################################################################
|
||||
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
wait_for_clean_gpus() {
|
||||
local timeout=${1:-300}
|
||||
local start=$SECONDS
|
||||
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
|
||||
while true; do
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
return
|
||||
fi
|
||||
if (( SECONDS - start >= timeout )); then
|
||||
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
|
||||
exit 1
|
||||
fi
|
||||
sleep 3
|
||||
done
|
||||
}
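# Illustrative usage (assumes the default 300s timeout is acceptable):
#   wait_for_clean_gpus        # poll /opt/amdgpu/etc/gpu_state every 3s, up to 300s
#   wait_for_clean_gpus 600    # or allow up to 10 minutes before failing the job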
|
||||
|
||||
# cleanup older docker images
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
@@ -28,15 +40,12 @@ cleanup_docker() {
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
@@ -45,193 +54,258 @@ cleanup_docker() {
|
||||
}
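# Illustrative: with threshold=70, a df reading of "82%" for the Docker root
# filesystem yields disk_usage=82 and triggers the image/volume prune above,
# while a reading of "55%" leaves everything in place.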
|
||||
|
||||
cleanup_network() {
|
||||
for node in $(seq 0 $((NUM_NODES-1))); do
|
||||
if docker pr -a -q -f name="node${node}" | grep -q .; then
|
||||
docker stop "node${node}"
|
||||
local max_nodes=${NUM_NODES:-2}
|
||||
for node in $(seq 0 $((max_nodes - 1))); do
|
||||
if docker ps -a -q -f name="node${node}" | grep -q .; then
|
||||
docker stop "node${node}" || true
|
||||
fi
|
||||
done
|
||||
if docker network ls | grep docker-net; then
|
||||
docker network rm docker-net
|
||||
if docker network ls | grep -q docker-net; then
|
||||
docker network rm docker-net || true
|
||||
fi
|
||||
}
|
||||
|
||||
# Call the cleanup docker function
|
||||
is_multi_node() {
|
||||
local cmds="$1"
|
||||
# Primary signal: NUM_NODES environment variable set by the pipeline
|
||||
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
|
||||
return 0
|
||||
fi
|
||||
# Fallback: detect the bracket syntax structurally
|
||||
# Pattern: [...] && [...] (per-node command arrays)
|
||||
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
}
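# Illustrative checks (hypothetical command strings, NUM_NODES unset):
#   is_multi_node 'pytest -v -s v1/core'                                 # -> 1 (single node)
#   is_multi_node 'PREFIX ; ["cmd_a0","cmd_a1"] && ["cmd_b0","cmd_b1"]'  # -> 0 (multi-node)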
|
||||
|
||||
###############################################################################
|
||||
# Pytest marker re-quoting
|
||||
#
|
||||
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
|
||||
# quotes around pytest -m marker expressions get stripped:
|
||||
# pytest -v -s -m 'not cpu_test' v1/core
|
||||
# becomes:
|
||||
# pytest -v -s -m not cpu_test v1/core
|
||||
#
|
||||
# pytest then interprets "cpu_test" as a file path, not part of the marker.
|
||||
# This function detects unquoted multi-word marker expressions and re-quotes
|
||||
# them so they survive the final bash -c expansion.
|
||||
###############################################################################
|
||||
|
||||
re_quote_pytest_markers() {
|
||||
local cmds="$1"
|
||||
# Pattern: -m not <identifier> -> -m 'not <identifier>'
|
||||
# Handles the common cases: 'not cpu_test', 'not slow_test', etc.
|
||||
cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
|
||||
echo "$cmds"
|
||||
}
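# Illustrative: an unquoted marker expression is restored before the final bash -c:
#   re_quote_pytest_markers "pytest -v -s -m not cpu_test v1/core"
#   # -> pytest -v -s -m 'not cpu_test' v1/core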
|
||||
|
||||
###############################################################################
|
||||
# ROCm-specific pytest command rewrites
|
||||
#
|
||||
# These apply ignore flags and environment overrides for tests that are not
|
||||
# yet supported or behave differently on ROCm hardware. Kept as a single
|
||||
# function so new exclusions are easy to add in one place.
|
||||
###############################################################################
|
||||
|
||||
apply_rocm_test_overrides() {
|
||||
local cmds="$1"
|
||||
|
||||
# --- Model registry filter ---
|
||||
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
# --- LoRA: disable custom paged attention ---
|
||||
if [[ $cmds == *"pytest -v -s lora"* ]]; then
|
||||
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||
fi
|
||||
|
||||
# --- Kernel ignores ---
|
||||
if [[ $cmds == *" kernels/core"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||
--ignore=kernels/core/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/attention"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/attention/test_attention_selector.py \
|
||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/attention/test_flash_attn.py \
|
||||
--ignore=kernels/attention/test_flashinfer.py \
|
||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/attention/test_mha_attn.py \
|
||||
--ignore=kernels/attention/test_lightning_attn.py \
|
||||
--ignore=kernels/attention/test_attention.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/quantization"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/quantization/test_int8_quant.py \
|
||||
--ignore=kernels/quantization/test_machete_mm.py \
|
||||
--ignore=kernels/quantization/test_block_fp8.py \
|
||||
--ignore=kernels/quantization/test_block_int8.py \
|
||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/mamba"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" kernels/moe"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=kernels/moe/test_moe.py \
|
||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||
fi
|
||||
|
||||
# --- Entrypoint ignores ---
|
||||
if [[ $cmds == *" entrypoints/openai "* ]]; then
|
||||
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" entrypoints/llm "* ]]; then
|
||||
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
# Clean up escaped newlines from --ignore appends
|
||||
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
|
||||
|
||||
echo "$cmds"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
# Main
|
||||
###############################################################################
|
||||
|
||||
# --- GPU initialization ---
|
||||
echo "--- Confirming Clean Initial State"
|
||||
wait_for_clean_gpus
|
||||
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# --- Docker housekeeping ---
|
||||
cleanup_docker
|
||||
|
||||
echo "--- Resetting GPUs"
|
||||
|
||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||
wait_for_clean_gpus
|
||||
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
# --- Pull test image ---
|
||||
echo "--- Pulling container"
|
||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# --- Prepare commands ---
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
commands="$*"
|
||||
echo "Raw commands: $commands"
|
||||
|
||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||
|
||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
|
||||
|
||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||
fi
|
||||
|
||||
#ignore certain kernels tests
|
||||
if [[ $commands == *" kernels/core"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||
--ignore=kernels/core/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/attention"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/attention/test_attention_selector.py \
|
||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/attention/test_flash_attn.py \
|
||||
--ignore=kernels/attention/test_flashinfer.py \
|
||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/attention/test_mha_attn.py \
|
||||
--ignore=kernels/attention/test_lightning_attn.py \
|
||||
--ignore=kernels/attention/test_attention.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/quantization/test_int8_quant.py \
|
||||
--ignore=kernels/quantization/test_machete_mm.py \
|
||||
--ignore=kernels/quantization/test_block_fp8.py \
|
||||
--ignore=kernels/quantization/test_block_int8.py \
|
||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/moe"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/moe/test_moe.py \
|
||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/openai tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/llm tests
|
||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
commands=$(echo "$commands" | sed 's/ \\ / /g')
|
||||
# Fix quoting before ROCm overrides (so overrides see correct structure)
|
||||
commands=$(re_quote_pytest_markers "$commands")
|
||||
commands=$(apply_rocm_test_overrides "$commands")
|
||||
echo "Final commands: $commands"
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
||||
|
||||
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# Test that we're launching on the machine that has
|
||||
# proper access to GPUs
|
||||
# Verify GPU access
|
||||
render_gid=$(getent group render | cut -d: -f3)
|
||||
if [[ -z "$render_gid" ]]; then
|
||||
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
|
||||
|
||||
# --- Route: multi-node vs single-node ---
|
||||
if is_multi_node "$commands"; then
|
||||
echo "--- Multi-node job detected"
|
||||
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
|
||||
|
||||
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
|
||||
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
|
||||
echo "PREFIX: ${prefix}"
|
||||
export composite_command="(command rocm-smi || true)"
|
||||
myIFS=$IFS
|
||||
IFS=','
|
||||
read -ra node0 <<< ${BASH_REMATCH[2]}
|
||||
read -ra node1 <<< ${BASH_REMATCH[3]}
|
||||
IFS=$myIFS
|
||||
for i in "${!node0[@]}";do
|
||||
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
|
||||
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
|
||||
|
||||
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
|
||||
echo "COMMANDS: ${commands}"
|
||||
composite_command=$(echo "${composite_command} && ${commands}")
|
||||
done
|
||||
/bin/bash -c "${composite_command}"
|
||||
cleanup_network
|
||||
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
|
||||
# BASH_REMATCH[1] = prefix (everything before first bracket)
|
||||
# BASH_REMATCH[2] = comma-separated node0 commands
|
||||
# BASH_REMATCH[3] = comma-separated node1 commands
|
||||
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
|
||||
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
|
||||
echo "PREFIX: ${prefix}"
|
||||
|
||||
export composite_command="(command rocm-smi || true)"
|
||||
saved_IFS=$IFS
|
||||
IFS=','
|
||||
read -ra node0 <<< "${BASH_REMATCH[2]}"
|
||||
read -ra node1 <<< "${BASH_REMATCH[3]}"
|
||||
IFS=$saved_IFS
|
||||
|
||||
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
|
||||
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
|
||||
fi
|
||||
|
||||
for i in "${!node0[@]}"; do
|
||||
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
|
||||
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
|
||||
|
||||
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
|
||||
echo "COMMANDS: ${step_cmd}"
|
||||
composite_command="${composite_command} && ${step_cmd}"
|
||||
done
|
||||
|
||||
/bin/bash -c "${composite_command}"
|
||||
cleanup_network
|
||||
else
|
||||
echo "Failed to parse node commands! Exiting."
|
||||
cleanup_network
|
||||
exit 111
|
||||
echo "Multi-node job detected but failed to parse bracket command syntax."
|
||||
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
|
||||
echo "Got: $commands"
|
||||
cleanup_network
|
||||
exit 111
|
||||
fi
|
||||
else
|
||||
echo "--- Single-node job"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
fi
|
||||
|
||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -xve
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
@@ -43,7 +43,7 @@ function cpu_tests() {
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
|
||||
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
|
||||
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
|
||||
timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
|
||||
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
|
||||
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
|
||||
|
||||
@@ -7,7 +7,7 @@ set -exuo pipefail
|
||||
# Try building the docker image
|
||||
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
|
||||
cat <<EOF | docker build -t ${image_name} -f - .
|
||||
cat <<EOF | docker build -t "${image_name}" -f - .
|
||||
FROM gaudi-base-image:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
@@ -39,12 +39,12 @@ EOF
|
||||
# functions, while other platforms only need one remove_docker_container
|
||||
# function.
|
||||
EXITCODE=1
|
||||
remove_docker_containers() { docker rm -f ${container_name} || true; }
|
||||
remove_docker_containers() { docker rm -f "${container_name}" || true; }
|
||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||
remove_docker_containers
|
||||
|
||||
echo "Running HPU plugin v1 test"
|
||||
docker run --rm --runtime=habana --name=${container_name} --network=host \
|
||||
docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
||||
-e HABANA_VISIBLE_DEVICES=all \
|
||||
-e VLLM_SKIP_WARMUP=true \
|
||||
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
|
||||
|
||||
@@ -41,6 +41,7 @@ get_config() {
|
||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
||||
exit 1
|
||||
fi
|
||||
# shellcheck source=/dev/null
|
||||
source "${TEST_RUN_CONFIG_FILE}"
|
||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
||||
return 0
|
||||
@@ -48,9 +49,8 @@ get_config() {
|
||||
|
||||
# get test running configuration.
|
||||
fetch_vllm_test_cfg
|
||||
get_config
|
||||
# Check if the function call was successful. If not, exit the script.
|
||||
if [ $? -ne 0 ]; then
|
||||
if ! get_config; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
||||
echo "agent_idx: ${agent_idx}"
|
||||
builder_name="cachebuilder${agent_idx}"
|
||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
||||
mkdir -p ${builder_cache_dir}
|
||||
mkdir -p "${builder_cache_dir}"
|
||||
|
||||
# Try building the docker image
|
||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
||||
--progress=plain --load -t ${image_name} -f - .
|
||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
|
||||
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
|
||||
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
|
||||
--progress=plain --load -t "${image_name}" -f - .
|
||||
FROM ${BASE_IMAGE_NAME}
|
||||
|
||||
# Define environments
|
||||
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
|
||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
||||
|
||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
|
||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
||||
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
||||
# returns one argument per line: --device, /dev/davinciX, ...
|
||||
parse_and_gen_devices() {
|
||||
local input="$1"
|
||||
local index cards_num
|
||||
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
|
||||
return 1
|
||||
fi
|
||||
|
||||
local devices=""
|
||||
local i=0
|
||||
while (( i < cards_num )); do
|
||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
||||
devices="$devices --device /dev/davinci${dev_idx}"
|
||||
printf '%s\n' "--device"
|
||||
printf '%s\n' "/dev/davinci${dev_idx}"
|
||||
((i++))
|
||||
done
|
||||
|
||||
# trim leading space
|
||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
||||
# Output devices: assigned to the caller variable
|
||||
printf '%s' "$devices"
|
||||
}
|
||||
|
||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
||||
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
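# Illustrative: for BUILDKITE_AGENT_NAME="atlas-a2-001-1-2cards" (agent_idx=1, 2 cards),
# device_args=( --device /dev/davinci0 --device /dev/davinci1 )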
|
||||
|
||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
||||
# the hardware plugin vllm-ascend.
|
||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
||||
mkdir -p ${model_cache_dir}
|
||||
mkdir -p "${model_cache_dir}"
|
||||
docker run \
|
||||
${devices} \
|
||||
"${device_args[@]}" \
|
||||
--device /dev/davinci_manager \
|
||||
--device /dev/devmm_svm \
|
||||
--device /dev/hisi_hdc \
|
||||
@@ -182,7 +177,7 @@ docker run \
|
||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
||||
-v "${model_cache_dir}":/root/.cache/modelscope \
|
||||
--entrypoint="" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
|
||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
echo "--- Python dependencies installed ---"
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
||||
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
@@ -39,6 +39,7 @@ docker run \
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
|
||||
@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
|
||||
|
||||
# pull original arch-dependent images from AWS ECR Public
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
|
||||
# tag arch-dependent images
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||
# push arch-dependent images to DockerHub
|
||||
docker push vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker push vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||
# push arch-independent manifest to DockerHub
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
|
||||
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:"$TAG_NAME"
|
||||
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Setup script for Prime-RL integration tests
|
||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
||||
|
||||
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
|
||||
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Setting up Prime-RL integration test environment..."
|
||||
|
||||
# Clean up any existing Prime-RL directory
|
||||
if [ -d "${PRIME_RL_DIR}" ]; then
|
||||
echo "Removing existing Prime-RL directory..."
|
||||
rm -rf "${PRIME_RL_DIR}"
|
||||
fi
|
||||
|
||||
# Install UV if not available
|
||||
if ! command -v uv &> /dev/null; then
|
||||
echo "Installing UV package manager..."
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
source $HOME/.local/bin/env
|
||||
fi
|
||||
|
||||
# Clone Prime-RL repository at specific branch for reproducible tests
|
||||
PRIME_RL_BRANCH="integ-vllm-main"
|
||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
|
||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
|
||||
cd "${PRIME_RL_DIR}"
|
||||
|
||||
echo "Setting up UV project environment..."
|
||||
export UV_PROJECT_ENVIRONMENT=/usr/local
|
||||
ln -s /usr/bin/python3 /usr/local/bin/python
|
||||
|
||||
# Remove vllm pin from pyproject.toml
|
||||
echo "Removing vllm pin from pyproject.toml..."
|
||||
sed -i '/vllm==/d' pyproject.toml
|
||||
|
||||
# Sync Prime-RL dependencies
|
||||
echo "Installing Prime-RL dependencies..."
|
||||
uv sync --inexact && uv sync --inexact --all-extras
|
||||
|
||||
# Verify installation
|
||||
echo "Verifying installations..."
|
||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
|
||||
|
||||
echo "Prime-RL integration test environment setup complete!"
|
||||
|
||||
echo "Running Prime-RL integration tests..."
|
||||
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
|
||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
|
||||
|
||||
echo "Prime-RL integration tests completed!"
|
||||
@@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--enable-eplb \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--all2all-backend $BACK \
|
||||
--port $PORT &
|
||||
--all2all-backend "$BACK" \
|
||||
--port "$PORT" &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
wait_for_server "$PORT"
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# Nightly e2e test for prefetch offloading with a MoE model.
|
||||
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
|
||||
# and validates GSM8K accuracy matches baseline (no offloading).
|
||||
#
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8030}
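# Example invocation (mirrors the scheduled pipeline step below):
#   bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030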
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout 600 bash -c '
|
||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||
sleep 1
|
||||
done'
|
||||
}
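# e.g. wait_for_server "$PORT" blocks until http://127.0.0.1:$PORT/health responds,
# failing the step if the server does not come up within 600s.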
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-Lite"
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" 2>/dev/null || true
|
||||
for _ in {1..20}; do
|
||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
vllm serve "$MODEL" \
|
||||
--max-model-len 2048 \
|
||||
--offload-group-size 8 \
|
||||
--offload-num-in-group 2 \
|
||||
--offload-prefetch-step 1 \
|
||||
--offload-params w13_weight w2_weight \
|
||||
--port "$PORT" &
|
||||
SERVER_PID=$!
|
||||
wait_for_server "$PORT"
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
|
||||
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
|
||||
PY
|
||||
|
||||
cleanup
|
||||
SERVER_PID=
|
||||
@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--enable-eplb \
|
||||
--all2all-backend $BACK \
|
||||
--all2all-backend "$BACK" \
|
||||
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
||||
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
||||
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
||||
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
|
||||
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
|
||||
--enable-expert-parallel \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--port $PORT &
|
||||
--port "$PORT" &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
wait_for_server "$PORT"
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
|
||||
@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--tensor-parallel-size 4 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--all2all-backend $BACK \
|
||||
--all2all-backend "$BACK" \
|
||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
"${PLATFORM_ARGS[@]}" \
|
||||
--port $PORT &
|
||||
--port "$PORT" &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
wait_for_server "$PORT"
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
|
||||
@@ -9,10 +9,11 @@ ENV_FILE=$1
|
||||
|
||||
# For testing on local vm, use `set -a` to export all variables
|
||||
source /etc/environment
|
||||
source $ENV_FILE
|
||||
# shellcheck source=/dev/null
|
||||
source "$ENV_FILE"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f $CONTAINER_NAME || true;
|
||||
docker rm -f "$CONTAINER_NAME" || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
@@ -41,13 +42,13 @@ echo
|
||||
echo "starting docker...$CONTAINER_NAME"
|
||||
echo
|
||||
docker run \
|
||||
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
||||
--env-file $ENV_FILE \
|
||||
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
|
||||
--env-file "$ENV_FILE" \
|
||||
-e HF_TOKEN="$HF_TOKEN" \
|
||||
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
||||
-e MODEL=$MODEL \
|
||||
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
|
||||
-e MODEL="$MODEL" \
|
||||
-e WORKSPACE=/workspace \
|
||||
--name $CONTAINER_NAME \
|
||||
--name "$CONTAINER_NAME" \
|
||||
-d \
|
||||
--privileged \
|
||||
--network host \
|
||||
|
||||
@@ -42,21 +42,21 @@ echo "lanching vllm..."
|
||||
echo "logging to $VLLM_LOG"
|
||||
echo
|
||||
|
||||
vllm serve $MODEL \
|
||||
vllm serve "$MODEL" \
|
||||
--seed 42 \
|
||||
--max-num-seqs $MAX_NUM_SEQS \
|
||||
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
||||
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
||||
--max-num-seqs "$MAX_NUM_SEQS" \
|
||||
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
|
||||
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
|
||||
--no-enable-prefix-caching \
|
||||
--download_dir $DOWNLOAD_DIR \
|
||||
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
|
||||
--download_dir "$DOWNLOAD_DIR" \
|
||||
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
|
||||
|
||||
|
||||
echo "wait for 20 minutes.."
|
||||
echo
|
||||
# sleep 1200
|
||||
# wait for 10 minutes...
|
||||
for i in {1..120}; do
|
||||
for _ in {1..120}; do
|
||||
# TODO: detect other type of errors.
|
||||
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
||||
echo "Detected RuntimeError, exiting."
|
||||
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
|
||||
echo
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--model "$MODEL" \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path benchmarks/sonnet_4x.txt \
|
||||
--sonnet-input-len $INPUT_LEN \
|
||||
--sonnet-output-len $OUTPUT_LEN \
|
||||
--sonnet-input-len "$INPUT_LEN" \
|
||||
--sonnet-output-len "$OUTPUT_LEN" \
|
||||
--ignore-eos > "$BM_LOG"
|
||||
|
||||
echo "completed..."
|
||||
|
||||
@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
# these indices have relative paths that work as long as they sit next to the wheel directory in s3
|
||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
|
||||
else
|
||||
alias_arg=""
|
||||
alias_args=()
|
||||
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
|
||||
fi
|
||||
|
||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
|
||||
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
@@ -100,9 +99,9 @@ fi
|
||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
if [[ "$version" != *"dev"* ]]; then
|
||||
echo "Re-generating indices for /$pure_version/"
|
||||
rm -rf "$INDICES_OUTPUT_DIR/*"
|
||||
rm -rf "${INDICES_OUTPUT_DIR:?}/*"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||
fi
|
||||
|
||||
@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
||||
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
|
||||
|
||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||
|
||||
@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
|
||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||
echo "Wheels copied to local directory"
|
||||
# generate source tarball
|
||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
||||
ls -la $DIST_DIR
|
||||
|
||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||
@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -m twine check $PYPI_WHEEL_FILES
|
||||
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
|
||||
python3 -m twine check "$PYPI_WHEEL_FILES"
|
||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
||||
echo "Wheels uploaded to PyPI"
|
||||
|
||||
@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
|
||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
|
||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
||||
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||
|
||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
|
||||
fi
|
||||
|
||||
# Extract version from vLLM wheel and update version-specific index
|
||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
||||
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
|
||||
if [ -n "$VLLM_WHEEL" ]; then
|
||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $VERSION"
|
||||
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -14,3 +14,8 @@ steps:
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
@@ -17,3 +17,15 @@ steps:
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
|
||||
- label: Attention Benchmarks Smoke Test (B200)
|
||||
device: b200
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- benchmarks/attention_benchmarks/
|
||||
- vllm/v1/attention/
|
||||
commands:
|
||||
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
|
||||
|
||||
@@ -121,13 +121,10 @@ steps:
|
||||
optional: true
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||
# Run just llama3 (fp8 & fp4) for all config combinations
|
||||
# -k "llama-3"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
|
||||
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
|
||||
|
||||
- label: Fusion E2E TP2 Quick (H100)
|
||||
timeout_in_minutes: 20
|
||||
@@ -162,7 +159,7 @@ steps:
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
|
||||
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
|
||||
@@ -197,7 +194,8 @@ steps:
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# for ar-rms-quant-fp4, also sweep llama3
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
|
||||
|
||||
@@ -104,7 +104,6 @@ steps:
|
||||
# NEW rlhf examples
|
||||
- cd new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
@@ -146,6 +145,7 @@ steps:
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
@@ -165,6 +165,7 @@ steps:
|
||||
num_devices: 2
|
||||
num_nodes: 2
|
||||
no_plugin: true
|
||||
optional: true # TODO: revert once infra issue solved
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -197,7 +198,18 @@ steps:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs))
|
||||
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
|
||||
@@ -29,15 +29,11 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: Prime-RL Integration (2 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
|
||||
timeout_in_minutes: 60
|
||||
device: h100
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_devices: 2
|
||||
num_devices: 1
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
|
||||
|
||||
@@ -28,3 +28,11 @@ steps:
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
# Run the rest of v1/engine tests
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
commands:
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
@@ -24,6 +24,11 @@ steps:
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (API Server 1)
|
||||
timeout_in_minutes: 130
|
||||
@@ -42,15 +47,13 @@ steps:
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
- tests/entrypoints/sleep
|
||||
- tests/entrypoints/instrumentator
|
||||
- tests/entrypoints/rpc
|
||||
- tests/entrypoints/instrumentator
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s entrypoints/instrumentator
|
||||
- pytest -v -s entrypoints/sleep
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s tool_use
|
||||
|
||||
- label: Entrypoints Integration (Pooling)
|
||||
@@ -62,6 +65,11 @@ steps:
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/pooling
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (Responses API)
|
||||
timeout_in_minutes: 50
|
||||
|
||||
@@ -115,6 +115,7 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
@@ -156,14 +157,3 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
@@ -73,3 +73,29 @@ steps:
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
||||
|
||||
- label: GPQA Eval (GPT-OSS) (H100)
|
||||
timeout_in_minutes: 120
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/evals/gpt_oss/
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
|
||||
|
||||
- label: GPQA Eval (GPT-OSS) (B200)
|
||||
timeout_in_minutes: 120
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/evals/gpt_oss/
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
|
||||
|
||||
@@ -16,6 +16,7 @@ steps:
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
# TODO: create another `optional` test group for slow tests
|
||||
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
@@ -25,6 +26,11 @@ steps:
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 Others (CPU)
|
||||
depends_on:
|
||||
@@ -108,9 +114,11 @@ steps:
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/detokenizer
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
commands:
|
||||
- pytest -v -s detokenizer
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
@@ -123,6 +131,7 @@ steps:
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/test_ray_env.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
@@ -136,6 +145,7 @@ steps:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s test_ray_env.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
@@ -143,20 +153,6 @@ steps:
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
device: h100
|
||||
|
||||
@@ -4,7 +4,6 @@ depends_on:
|
||||
steps:
|
||||
- label: Basic Models Tests (Initialization)
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -16,7 +15,6 @@ steps:
|
||||
|
||||
- label: Basic Models Tests (Extra Initialization) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
@@ -38,6 +36,12 @@ steps:
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
depends_on:
|
||||
|
||||
@@ -4,7 +4,6 @@ depends_on:
|
||||
steps:
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -16,7 +15,6 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Extra Standard) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
@@ -32,7 +30,6 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Hybrid) %N
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -40,7 +37,7 @@ steps:
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
# Shard hybrid language model tests
|
||||
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
@@ -48,7 +45,6 @@ steps:
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 80min
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -56,13 +52,21 @@ steps:
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
commands:
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||
|
||||
- label: Language Models Test (PPL)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -72,17 +76,20 @@ steps:
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Language Models Test (MTEB)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
|
||||
@@ -12,3 +12,10 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
commands:
|
||||
- pytest -v -s samplers
|
||||
|
||||
48
.github/CODEOWNERS
vendored
@@ -2,42 +2,60 @@
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# These lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
||||
/vllm/model_executor/layers/attention @LucasWilkinson
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
|
||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
/vllm/model_executor/layers/batch_invariant.py @yewentao256
|
||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
|
||||
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
|
||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
# so spam a lot of people
|
||||
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
||||
/vllm/config/cache.py @heheda12345
|
||||
|
||||
# Entrypoints
|
||||
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
|
||||
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
|
||||
/vllm/entrypoints/mcp @heheda12345
|
||||
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
|
||||
/vllm/entrypoints/openai/realtime @njhill
|
||||
/vllm/entrypoints/openai/speech_to_text @NickLucche
|
||||
/vllm/entrypoints/pooling @noooop
|
||||
/vllm/entrypoints/sagemaker @DarkLight1337
|
||||
/vllm/entrypoints/serve @njhill
|
||||
/vllm/entrypoints/*.py @njhill
|
||||
/vllm/entrypoints/chat_utils.py @DarkLight1337
|
||||
/vllm/entrypoints/llm.py @DarkLight1337
|
||||
|
||||
# Input/Output Processing
|
||||
/vllm/sampling_params.py @njhill @NickLucche
|
||||
/vllm/pooling_params.py @noooop @DarkLight1337
|
||||
/vllm/tokenizers @DarkLight1337 @njhill
|
||||
/vllm/renderers @DarkLight1337 @njhill
|
||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/attention @LucasWilkinson @MatthewBonanni
|
||||
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
||||
/vllm/v1/attention/backends/mla @pavanimajety
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/kv_offload @ApostaC @orozery
|
||||
/vllm/v1/worker/gpu/kv_connector.py @orozery
|
||||
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
|
||||
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
|
||||
|
||||
# Model runner V2
|
||||
/vllm/v1/worker/gpu @WoosukKwon
|
||||
@@ -115,8 +133,8 @@ mkdocs.yaml @hmellor
|
||||
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
|
||||
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
|
||||
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
|
||||
/vllm/tokenizers/mistral.py @patrickvonplaten
|
||||
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
|
||||
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
|
||||
|
||||
# Kernels
|
||||
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
||||
@@ -152,9 +170,7 @@ mkdocs.yaml @hmellor
|
||||
/examples/pooling @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/entrypoints/pooling @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
/vllm/pooling_params.py @noooop
|
||||
/vllm/model_executor/layers/pooler @noooop
|
||||
|
||||
# Security guide and policies
|
||||
|
||||
1
.github/workflows/cleanup_pr_body.yml
vendored
@@ -19,6 +19,7 @@ jobs:
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
cache: 'pip'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
|
||||
3
.gitignore
vendored
@@ -238,3 +238,6 @@ ep_kernels_workspace/
|
||||
vllm/grpc/vllm_engine_pb2.py
|
||||
vllm/grpc/vllm_engine_pb2_grpc.py
|
||||
vllm/grpc/vllm_engine_pb2.pyi
|
||||
|
||||
# Ignore generated cpu headers
|
||||
csrc/cpu/cpu_attn_dispatch_generated.h
|
||||
|
||||
@@ -143,6 +143,11 @@ repos:
|
||||
name: Check attention backend documentation is up to date
|
||||
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
|
||||
language: python
|
||||
- id: check-boolean-context-manager
|
||||
name: Check for boolean ops in with-statements
|
||||
entry: python tools/pre_commit/check_boolean_context_manager.py
|
||||
language: python
|
||||
types: [python]
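The new hook's name indicates it flags boolean operators inside `with` statements, where `and`/`or` yields a single operand instead of entering both context managers. A minimal Python sketch of the pattern such a check would presumably reject (illustrative only; the actual rules are in tools/pre_commit/check_boolean_context_manager.py, which is not shown here):

# Likely flagged: `and` evaluates both calls, but only the second object becomes
# the context manager, so the first file is opened and never closed.
with open("a.txt") and open("b.txt") as f:
    print(f.read())

# Intended form: enter both context managers explicitly.
with open("a.txt") as fa, open("b.txt") as fb:
    print(fa.read(), fb.read())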
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@@ -9,13 +9,14 @@ build:
|
||||
python: "3.12"
|
||||
jobs:
|
||||
post_checkout:
|
||||
- git fetch --unshallow || true
|
||||
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||
pre_create_environment:
|
||||
- pip install uv
|
||||
create_environment:
|
||||
- uv venv $READTHEDOCS_VIRTUALENV_PATH
|
||||
install:
|
||||
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
|
||||
|
||||
mkdocs:
|
||||
configuration: mkdocs.yaml
|
||||
fail_on_warning: true
|
||||
|
||||
# Optionally declare the Python requirements required to build your docs
|
||||
python:
|
||||
install:
|
||||
- requirements: requirements/docs.txt
|
||||
|
||||
@@ -293,6 +293,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/fused_qknorm_rope_kernel.cu"
|
||||
"csrc/layernorm_quant_kernels.cu"
|
||||
"csrc/sampler.cu"
|
||||
"csrc/topk.cu"
|
||||
"csrc/cuda_view.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/quantization/w8a8/int8/scaled_quant.cu"
|
||||
@@ -770,6 +771,24 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
|
||||
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
|
||||
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
|
||||
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
|
||||
"in CUDA target architectures.")
|
||||
endif()
|
||||
|
||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
@@ -1081,6 +1100,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||
" in CUDA target architectures")
|
||||
endif()
|
||||
|
||||
# DeepSeek V3 router GEMM kernel - requires SM90+
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
|
||||
set(DSV3_ROUTER_GEMM_SRC
|
||||
"csrc/moe/dsv3_router_gemm_entry.cu"
|
||||
"csrc/moe/dsv3_router_gemm_float_out.cu"
|
||||
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${DSV3_ROUTER_GEMM_SRC}"
|
||||
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
|
||||
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
|
||||
" (requires SM90+ and CUDA >= 12.0)")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
message(STATUS "Enabling moe extension.")
|
||||
|
||||
@@ -229,3 +229,40 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict:
|
||||
sum(r.kv_len for r in requests) / len(requests) if requests else 0
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
|
||||
"""
|
||||
Classify a batch spec into a type string.
|
||||
|
||||
Args:
|
||||
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
|
||||
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
|
||||
|
||||
Returns:
|
||||
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
|
||||
"""
|
||||
requests = parse_batch_spec(batch_spec)
|
||||
|
||||
# Classify each request
|
||||
types_present = set()
|
||||
for req in requests:
|
||||
if req.is_decode:
|
||||
types_present.add("decode")
|
||||
elif req.is_prefill:
|
||||
types_present.add("prefill")
|
||||
elif req.is_extend:
|
||||
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
|
||||
if req.q_len <= spec_decode_threshold:
|
||||
types_present.add("spec-decode")
|
||||
else:
|
||||
types_present.add("extend")
|
||||
|
||||
if len(types_present) == 1:
|
||||
return types_present.pop()
|
||||
elif len(types_present) > 1:
|
||||
# Sort for consistent output
|
||||
sorted_types = sorted(types_present)
|
||||
return f"mixed ({'+'.join(sorted_types)})"
|
||||
else:
|
||||
return "unknown"
|
||||
|
||||
@@ -43,6 +43,7 @@ from common import (
|
||||
ModelParameterSweep,
|
||||
ParameterSweep,
|
||||
ResultsFormatter,
|
||||
batch_spec_sort_key,
|
||||
is_mla_backend,
|
||||
)
|
||||
|
||||
@@ -218,10 +219,13 @@ def run_model_parameter_sweep(
|
||||
by_param_and_spec[key].append(r)
|
||||
break
|
||||
|
||||
# Sort by param value then spec
|
||||
# Sort by param value then spec (batch_size, q_len, kv_len)
|
||||
sorted_keys = sorted(
|
||||
by_param_and_spec.keys(),
|
||||
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
|
||||
key=lambda x: (
|
||||
int(x[0]) if x[0].isdigit() else x[0],
|
||||
batch_spec_sort_key(x[1]),
|
||||
),
|
||||
)
|
||||
|
||||
current_param_value = None
|
||||
@@ -330,7 +334,7 @@ def run_parameter_sweep(
|
||||
by_spec[spec] = []
|
||||
by_spec[spec].append(r)
|
||||
|
||||
for spec in sorted(by_spec.keys()):
|
||||
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
|
||||
results = by_spec[spec]
|
||||
best = min(results, key=lambda r: r.mean_time)
|
||||
console.print(
|
||||
@@ -496,15 +500,18 @@ def main():
|
||||
if "description" in yaml_config:
|
||||
console.print(f"[dim]{yaml_config['description']}[/]")
|
||||
|
||||
# Override args with YAML values
|
||||
# (YAML takes precedence unless CLI arg was explicitly set)
|
||||
# Backend(s)
|
||||
if "backend" in yaml_config:
|
||||
args.backend = yaml_config["backend"]
|
||||
args.backends = None
|
||||
elif "backends" in yaml_config:
|
||||
args.backends = yaml_config["backends"]
|
||||
args.backend = None
|
||||
# Override args with YAML values, but CLI args take precedence
|
||||
# Check if CLI provided backends (they would be non-None and not default)
|
||||
cli_backends_provided = args.backends is not None or args.backend is not None
|
||||
|
||||
# Backend(s) - only use YAML if CLI didn't specify
|
||||
if not cli_backends_provided:
|
||||
if "backend" in yaml_config:
|
||||
args.backend = yaml_config["backend"]
|
||||
args.backends = None
|
||||
elif "backends" in yaml_config:
|
||||
args.backends = yaml_config["backends"]
|
||||
args.backend = None
|
||||
|
||||
# Check for special modes
|
||||
if "mode" in yaml_config:
|
||||
@@ -544,13 +551,15 @@ def main():
|
||||
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
|
||||
args.block_size = model.get("block_size", args.block_size)
|
||||
|
||||
# Benchmark settings
|
||||
if "benchmark" in yaml_config:
|
||||
bench = yaml_config["benchmark"]
|
||||
args.device = bench.get("device", args.device)
|
||||
args.repeats = bench.get("repeats", args.repeats)
|
||||
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
|
||||
args.profile_memory = bench.get("profile_memory", args.profile_memory)
|
||||
# Benchmark settings (top-level keys)
|
||||
if "device" in yaml_config:
|
||||
args.device = yaml_config["device"]
|
||||
if "repeats" in yaml_config:
|
||||
args.repeats = yaml_config["repeats"]
|
||||
if "warmup_iters" in yaml_config:
|
||||
args.warmup_iters = yaml_config["warmup_iters"]
|
||||
if "profile_memory" in yaml_config:
|
||||
args.profile_memory = yaml_config["profile_memory"]
|
||||
|
||||
# Parameter sweep configuration
|
||||
if "parameter_sweep" in yaml_config:
|
||||
|
||||
@@ -12,16 +12,36 @@ from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from batch_spec import get_batch_type, parse_batch_spec
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
|
||||
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
|
||||
"""
|
||||
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
|
||||
|
||||
This ensures results are sorted by batch size first, then query length,
|
||||
then sequence length, rather than alphabetically.
|
||||
"""
|
||||
try:
|
||||
requests = parse_batch_spec(spec)
|
||||
batch_size = len(requests)
|
||||
max_q_len = max(r.q_len for r in requests) if requests else 0
|
||||
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
||||
return (batch_size, max_q_len, max_kv_len)
|
||||
except Exception:
|
||||
# Fallback for unparseable specs
|
||||
return (0, 0, 0)
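For illustration, a hypothetical call using specs from the configs in this change (assuming parse_batch_spec handles the `<batch>q<q_len>s<kv_len>` form):

from common import batch_spec_sort_key

specs = ["128q1s2k", "16q1s512", "32q1s16k", "2q8k"]
print(sorted(specs, key=batch_spec_sort_key))
# ['2q8k', '16q1s512', '32q1s16k', '128q1s2k'] -- ordered by batch size first, not alphabetically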
|
||||
|
||||
|
||||
# Mock classes for vLLM attention infrastructure
|
||||
|
||||
|
||||
class MockHfConfig:
|
||||
"""Mock HuggingFace config that satisfies vLLM's requirements."""
|
||||
|
||||
def __init__(self, mla_dims: dict):
|
||||
def __init__(self, mla_dims: dict, index_topk: int | None = None):
|
||||
self.num_attention_heads = mla_dims["num_q_heads"]
|
||||
self.num_key_value_heads = mla_dims["num_kv_heads"]
|
||||
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
|
||||
@@ -32,6 +52,8 @@ class MockHfConfig:
|
||||
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
|
||||
self.v_head_dim = mla_dims["v_head_dim"]
|
||||
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
|
||||
if index_topk is not None:
|
||||
self.index_topk = index_topk
|
||||
|
||||
def get_text_config(self):
|
||||
return self
|
||||
@@ -82,6 +104,38 @@ class MockKVBProj:
|
||||
return (result,) # Return as tuple to match ColumnParallelLinear API
|
||||
|
||||
|
||||
class MockIndexer:
|
||||
"""Mock Indexer for sparse MLA backends.
|
||||
|
||||
Provides topk_indices_buffer that sparse MLA backends use to determine
|
||||
which KV cache slots to attend to for each token.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_num_tokens: int,
|
||||
topk_tokens: int,
|
||||
device: torch.device,
|
||||
):
|
||||
self.topk_tokens = topk_tokens
|
||||
self.topk_indices_buffer = torch.zeros(
|
||||
(max_num_tokens, topk_tokens),
|
||||
dtype=torch.int32,
|
||||
device=device,
|
||||
)
|
||||
|
||||
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
|
||||
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
|
||||
indices = torch.randint(
|
||||
0,
|
||||
max_kv_len,
|
||||
(num_tokens, self.topk_tokens),
|
||||
dtype=torch.int32,
|
||||
device=self.topk_indices_buffer.device,
|
||||
)
|
||||
self.topk_indices_buffer[:num_tokens] = indices
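A minimal sketch of how the mock indexer is presumably driven during a sparse-backend benchmark (values are illustrative; 8192 tokens and topk 2048 are the defaults used later in this change):

indexer = MockIndexer(max_num_tokens=8192, topk_tokens=2048, device=torch.device("cuda:0"))
indexer.fill_random_indices(num_tokens=256, max_kv_len=4096)
# The sparse MLA impl then reads indexer.topk_indices_buffer[:256] to choose KV slots per token.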
|
||||
|
||||
|
||||
class MockLayer(AttentionLayerBase):
|
||||
"""Mock attention layer with scale parameters and impl.
|
||||
|
||||
@@ -316,14 +370,19 @@ class ResultsFormatter:
|
||||
backends: List of backend names being compared
|
||||
compare_to_fastest: Show percentage comparison to fastest
|
||||
"""
|
||||
# Group by batch spec
|
||||
# Group by batch spec, preserving first-occurrence order
|
||||
by_spec = {}
|
||||
specs_order = []
|
||||
for r in results:
|
||||
spec = r.config.batch_spec
|
||||
if spec not in by_spec:
|
||||
by_spec[spec] = {}
|
||||
specs_order.append(spec)
|
||||
by_spec[spec][r.config.backend] = r
|
||||
|
||||
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
|
||||
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
|
||||
|
||||
# Create shortened backend names for display
|
||||
def shorten_backend_name(name: str) -> str:
|
||||
"""Shorten long backend names for table display."""
|
||||
@@ -337,6 +396,8 @@ class ResultsFormatter:
|
||||
|
||||
table = Table(title="Attention Benchmark Results")
|
||||
table.add_column("Batch\nSpec", no_wrap=True)
|
||||
table.add_column("Type", no_wrap=True)
|
||||
table.add_column("Batch\nSize", justify="right", no_wrap=True)
|
||||
|
||||
multi = len(backends) > 1
|
||||
for backend in backends:
|
||||
@@ -350,12 +411,14 @@ class ResultsFormatter:
|
||||
table.add_column(col_rel, justify="right", no_wrap=False)
|
||||
|
||||
# Add rows
|
||||
for spec in sorted(by_spec.keys()):
|
||||
for spec in specs_order:
|
||||
spec_results = by_spec[spec]
|
||||
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
|
||||
best_time = min(times.values()) if times else 0.0
|
||||
|
||||
row = [spec]
|
||||
batch_type = get_batch_type(spec)
|
||||
batch_size = len(parse_batch_spec(spec))
|
||||
row = [spec, batch_type, str(batch_size)]
|
||||
for backend in backends:
|
||||
if backend in spec_results:
|
||||
r = spec_results[backend]
|
||||
@@ -486,10 +549,11 @@ def get_attention_scale(head_dim: int) -> float:
|
||||
|
||||
def is_mla_backend(backend: str) -> bool:
|
||||
"""
|
||||
Check if backend is an MLA backend using the backend's is_mla() property.
|
||||
Check if backend is an MLA backend using the AttentionBackendEnum.
|
||||
|
||||
Args:
|
||||
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
|
||||
backend: Backend name matching AttentionBackendEnum exactly
|
||||
(e.g., "FLASHMLA_SPARSE")
|
||||
|
||||
Returns:
|
||||
True if the backend is an MLA backend, False otherwise
|
||||
@@ -497,7 +561,8 @@ def is_mla_backend(backend: str) -> bool:
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
try:
|
||||
backend_class = AttentionBackendEnum[backend.upper()].get_class()
|
||||
backend_enum = AttentionBackendEnum[backend]
|
||||
backend_class = backend_enum.get_class()
|
||||
return backend_class.is_mla()
|
||||
except (KeyError, ValueError, ImportError):
|
||||
except (KeyError, ValueError, ImportError, AttributeError):
|
||||
return False
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_q_heads: 128 # Base value, can be swept for TP simulation
|
||||
num_kv_heads: 1 # MLA uses single latent KV
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
@@ -12,6 +12,13 @@ model:
|
||||
v_head_dim: 128
|
||||
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
|
||||
|
||||
batch_specs:
|
||||
# Small batches, varying sequence lengths
|
||||
- "16q1s512" # 16 requests, 512 KV cache
|
||||
@@ -34,28 +41,30 @@ batch_specs:
|
||||
# Very large batches
|
||||
- "128q1s1k" # 128 requests, 1k KV cache
|
||||
- "128q1s2k" # 128 requests, 2k KV cache
|
||||
- "128q1s4k" # 128 requests, 4k KV cache
|
||||
- "128q1s8k" # 128 requests, 8k KV cache
|
||||
|
||||
# Long context
|
||||
- "32q1s16k" # 32 requests, 16k KV cache
|
||||
- "32q1s32k" # 32 requests, 32k KV cache
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
- CUTLASS_MLA
|
||||
- FLASHINFER_MLA
|
||||
- FLASH_ATTN_MLA # Hopper only
|
||||
- FLASHMLA # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
repeats: 100
|
||||
warmup_iters: 10
|
||||
profile_memory: true
|
||||
|
||||
# Backend-specific tuning
|
||||
cutlass_mla:
|
||||
CUTLASS_MLA:
|
||||
num_kv_splits: auto # or specific value like 4, 8, 16
|
||||
|
||||
flashattn_mla:
|
||||
FLASH_ATTN_MLA:
|
||||
reorder_batch_threshold: 512
|
||||
|
||||
flashmla:
|
||||
FLASHMLA:
|
||||
reorder_batch_threshold: 1
|
||||
|
||||
@@ -45,10 +45,10 @@ batch_specs:
|
||||
- "4q4k_60q1s4k" # 4 prefill + 60 decode
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
- CUTLASS_MLA
|
||||
- FLASHINFER_MLA
|
||||
- FLASH_ATTN_MLA # Hopper only
|
||||
- FLASHMLA # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
|
||||
62
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
# MLA prefill-only benchmark configuration for sparse backends
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
|
||||
|
||||
batch_specs:
|
||||
# Pure prefill
|
||||
- "1q512"
|
||||
- "1q1k"
|
||||
- "1q2k"
|
||||
- "1q4k"
|
||||
- "1q8k"
|
||||
|
||||
# Batched pure prefill
|
||||
- "2q512"
|
||||
- "2q1k"
|
||||
- "2q2k"
|
||||
- "2q4k"
|
||||
- "2q8k"
|
||||
- "4q512"
|
||||
- "4q1k"
|
||||
- "4q2k"
|
||||
- "4q4k"
|
||||
- "4q8k"
|
||||
- "8q512"
|
||||
- "8q1k"
|
||||
- "8q2k"
|
||||
- "8q4k"
|
||||
- "8q8k"
|
||||
|
||||
# Extend
|
||||
- "1q512s4k"
|
||||
- "1q512s8k"
|
||||
- "1q1ks8k"
|
||||
- "1q2ks8k"
|
||||
- "1q2ks16k"
|
||||
- "1q4ks16k"
|
||||
|
||||
backends:
|
||||
- FLASHMLA_SPARSE
|
||||
- FLASHINFER_MLA_SPARSE
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 10
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
@@ -6,7 +6,7 @@
|
||||
description: "Decode vs Prefill pipeline crossover analysis"
|
||||
|
||||
# Test FlashAttn MLA
|
||||
backend: flashattn_mla
|
||||
backend: FLASH_ATTN_MLA
|
||||
|
||||
# Mode: decode_vs_prefill comparison (special sweep mode)
|
||||
# For each batch spec, we'll test both decode and prefill pipelines
|
||||
@@ -62,11 +62,10 @@ model:
|
||||
block_size: 128
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 15 # More repeats for spec decode variance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
device: "cuda:0"
|
||||
repeats: 15 # More repeats for spec decode variance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Output
|
||||
output:
|
||||
|
||||
@@ -41,18 +41,17 @@ batch_specs:
|
||||
|
||||
# Backends that support query length > 1
|
||||
backends:
|
||||
- flashattn_mla # reorder_batch_threshold = 512
|
||||
- flashmla # reorder_batch_threshold = 1 (tunable)
|
||||
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
|
||||
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
|
||||
|
||||
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
|
||||
# - flashinfer_mla
|
||||
# - FLASHINFER_MLA
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 10 # More repeats for statistical significance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
device: "cuda:0"
|
||||
repeats: 10 # More repeats for statistical significance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Test these threshold values for optimization
|
||||
parameter_sweep:
|
||||
|
||||
@@ -25,14 +25,22 @@ batch_specs:
|
||||
- "4q1k_16q1s2k" # 4 prefill + 16 decode
|
||||
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
|
||||
|
||||
# Context extension
|
||||
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
|
||||
# Speculative decode (q <= 8)
|
||||
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache
|
||||
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
|
||||
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
|
||||
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
|
||||
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
|
||||
|
||||
# Context extension (chunked prefill)
|
||||
- "q1ks2k" # 1k query, 2k sequence
|
||||
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
|
||||
|
||||
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
|
||||
backends:
|
||||
- flash
|
||||
- triton
|
||||
- flashinfer
|
||||
- FLASH_ATTN
|
||||
- TRITON_ATTN
|
||||
- FLASHINFER
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
|
||||
@@ -8,14 +8,13 @@ This module provides helpers for running MLA backends without
|
||||
needing full VllmConfig integration.
|
||||
"""
|
||||
|
||||
import importlib
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from batch_spec import parse_batch_spec
|
||||
from common import (
|
||||
BenchmarkResult,
|
||||
MockHfConfig,
|
||||
MockIndexer,
|
||||
MockKVBProj,
|
||||
MockLayer,
|
||||
setup_mla_dims,
|
||||
@@ -62,6 +61,7 @@ def create_minimal_vllm_config(
|
||||
block_size: int = 128,
|
||||
max_num_seqs: int = 256,
|
||||
mla_dims: dict | None = None,
|
||||
index_topk: int | None = None,
|
||||
) -> VllmConfig:
|
||||
"""
|
||||
Create minimal VllmConfig for MLA benchmarks.
|
||||
@@ -73,6 +73,8 @@ def create_minimal_vllm_config(
|
||||
max_num_seqs: Maximum number of sequences
|
||||
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
|
||||
setup_mla_dims(model_name)
|
||||
index_topk: Optional topk value for sparse MLA backends. If provided,
|
||||
the config will include index_topk for sparse attention.
|
||||
|
||||
Returns:
|
||||
VllmConfig for benchmarking
|
||||
@@ -82,7 +84,7 @@ def create_minimal_vllm_config(
|
||||
mla_dims = setup_mla_dims(model_name)
|
||||
|
||||
# Create mock HF config first (avoids downloading from HuggingFace)
|
||||
mock_hf_config = MockHfConfig(mla_dims)
|
||||
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
|
||||
|
||||
# Create a temporary minimal config.json to avoid HF downloads
|
||||
# This ensures consistent ModelConfig construction without network access
|
||||
@@ -120,16 +122,12 @@ def create_minimal_vllm_config(
|
||||
seed=0,
|
||||
max_model_len=32768,
|
||||
quantization=None,
|
||||
quantization_param_path=None,
|
||||
enforce_eager=False,
|
||||
max_context_len_to_capture=None,
|
||||
max_seq_len_to_capture=8192,
|
||||
max_logprobs=20,
|
||||
disable_sliding_window=False,
|
||||
skip_tokenizer_init=True,
|
||||
served_model_name=None,
|
||||
limit_mm_per_prompt=None,
|
||||
use_async_output_proc=True,
|
||||
config_format="auto",
|
||||
)
|
||||
finally:
|
||||
@@ -180,56 +178,65 @@ def create_minimal_vllm_config(
|
||||
# ============================================================================
|
||||
|
||||
|
||||
# Backend name to class name prefix mapping
|
||||
_BACKEND_NAME_MAP = {
|
||||
"flashattn_mla": "FlashAttnMLA",
|
||||
"flashmla": "FlashMLA",
|
||||
"flashinfer_mla": "FlashInferMLA",
|
||||
"cutlass_mla": "CutlassMLA",
|
||||
}
|
||||
|
||||
# Special properties that differ from defaults
|
||||
# Backend-specific properties that can't be inferred from the backend class
|
||||
# Keys are AttentionBackendEnum names (uppercase)
|
||||
_BACKEND_PROPERTIES = {
|
||||
"flashmla": {
|
||||
"FLASHMLA": {
|
||||
"query_format": "concat", # Single concatenated tensor (vs tuple)
|
||||
"block_size": 64, # FlashMLA uses fixed block size
|
||||
},
|
||||
"flashinfer_mla": {
|
||||
"block_size": 64, # FlashInfer MLA only supports 32 or 64
|
||||
"FLASHMLA_SPARSE": {
|
||||
"query_format": "concat", # Single concatenated tensor (vs tuple)
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _get_backend_config(backend: str) -> dict:
|
||||
"""
|
||||
Get backend configuration using naming conventions.
|
||||
Get backend configuration from AttentionBackendEnum.
|
||||
|
||||
All MLA backends follow the pattern:
|
||||
- Module: vllm.v1.attention.backends.mla.{backend}
|
||||
- Impl: {Name}Impl
|
||||
- Metadata: {Name}Metadata (or MLACommonMetadata)
|
||||
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
|
||||
- MetadataBuilder: {Name}MetadataBuilder
|
||||
Uses the registry to get the backend class and extract configuration
|
||||
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
|
||||
|
||||
Args:
|
||||
backend: Backend name matching AttentionBackendEnum exactly
|
||||
(e.g., "FLASHMLA_SPARSE")
|
||||
|
||||
Returns:
|
||||
Dict with backend configuration
|
||||
"""
|
||||
if backend not in _BACKEND_NAME_MAP:
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
name = _BACKEND_NAME_MAP[backend]
|
||||
try:
|
||||
backend_enum = AttentionBackendEnum[backend]
|
||||
backend_class = backend_enum.get_class()
|
||||
except (KeyError, ValueError) as e:
|
||||
valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
|
||||
raise ValueError(
|
||||
f"Unknown backend: {backend}. "
|
||||
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
|
||||
) from e
|
||||
|
||||
# Get block size from backend class
|
||||
block_sizes = backend_class.get_supported_kernel_block_sizes()
|
||||
# Use first supported block size (backends typically support one for MLA)
|
||||
block_size = block_sizes[0] if block_sizes else None
|
||||
if hasattr(block_size, "value"):
|
||||
# Handle MultipleOf enum
|
||||
block_size = None
|
||||
|
||||
# Check if sparse via class method if available
|
||||
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
|
||||
|
||||
# Get properties that can't be inferred
|
||||
props = _BACKEND_PROPERTIES.get(backend, {})
|
||||
|
||||
# Check if backend uses common metadata (FlashInfer, CUTLASS)
|
||||
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
|
||||
|
||||
return {
|
||||
"module": f"vllm.v1.attention.backends.mla.{backend}",
|
||||
"impl_class": f"{name}Impl",
|
||||
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
|
||||
"decode_metadata_class": "MLACommonDecodeMetadata"
|
||||
if uses_common
|
||||
else f"{name}DecodeMetadata",
|
||||
"builder_class": f"{name}MetadataBuilder",
|
||||
"backend_class": backend_class,
|
||||
"impl_class": backend_class.get_impl_cls(),
|
||||
"builder_class": backend_class.get_builder_cls(),
|
||||
"query_format": props.get("query_format", "tuple"),
|
||||
"block_size": props.get("block_size", None),
|
||||
"block_size": block_size,
|
||||
"is_sparse": is_sparse,
|
||||
}
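A hypothetical call showing what the registry-driven config now carries (FLASHMLA_SPARSE is one of the enum names used in the new YAML configs; the exact values come from the backend class and the mapping above):

cfg = _get_backend_config("FLASHMLA_SPARSE")
impl_cls = cfg["impl_class"]        # class returned by backend_class.get_impl_cls()
builder_cls = cfg["builder_class"]  # class returned by backend_class.get_builder_cls()
print(cfg["query_format"], cfg["is_sparse"])  # expected "concat" and True for a sparse backend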
|
||||
|
||||
|
||||
@@ -447,22 +454,26 @@ def _create_backend_impl(
|
||||
mla_dims: dict,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
max_num_tokens: int = 8192,
|
||||
index_topk: int | None = None,
|
||||
):
|
||||
"""
|
||||
Create backend implementation instance.
|
||||
|
||||
Args:
|
||||
backend_cfg: Backend configuration dict
|
||||
backend_cfg: Backend configuration dict from _get_backend_config()
|
||||
mla_dims: MLA dimension configuration
|
||||
vllm_config: VllmConfig instance
|
||||
device: Target device
|
||||
max_num_tokens: Maximum number of tokens for sparse indexer buffer
|
||||
index_topk: Topk value for sparse MLA backends
|
||||
|
||||
Returns:
|
||||
Tuple of (impl, layer, builder_instance)
|
||||
Tuple of (impl, layer, builder_instance, indexer)
|
||||
"""
|
||||
# Import backend classes
|
||||
backend_module = importlib.import_module(backend_cfg["module"])
|
||||
impl_class = getattr(backend_module, backend_cfg["impl_class"])
|
||||
# Get classes from backend config (already resolved by _get_backend_config)
|
||||
impl_class = backend_cfg["impl_class"]
|
||||
builder_class = backend_cfg["builder_class"]
|
||||
|
||||
# Calculate scale
|
||||
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
|
||||
@@ -474,26 +485,44 @@ def _create_backend_impl(
|
||||
v_head_dim=mla_dims["v_head_dim"],
|
||||
)
|
||||
|
||||
# Create indexer for sparse backends
|
||||
indexer = None
|
||||
if backend_cfg.get("is_sparse", False):
|
||||
if index_topk is None:
|
||||
index_topk = 2048 # Default topk for sparse MLA
|
||||
indexer = MockIndexer(
|
||||
max_num_tokens=max_num_tokens,
|
||||
topk_tokens=index_topk,
|
||||
device=device,
|
||||
)
|
||||
|
||||
# Build impl kwargs
|
||||
impl_kwargs = {
|
||||
"num_heads": mla_dims["num_q_heads"],
|
||||
"head_size": mla_dims["head_dim"],
|
||||
"scale": scale,
|
||||
"num_kv_heads": mla_dims["num_kv_heads"],
|
||||
"alibi_slopes": None,
|
||||
"sliding_window": None,
|
||||
"kv_cache_dtype": "auto",
|
||||
"logits_soft_cap": None,
|
||||
"attn_type": "decoder",
|
||||
"kv_sharing_target_layer_name": None,
|
||||
"q_lora_rank": None,
|
||||
"kv_lora_rank": mla_dims["kv_lora_rank"],
|
||||
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
|
||||
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
|
||||
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
|
||||
"v_head_dim": mla_dims["v_head_dim"],
|
||||
"kv_b_proj": mock_kv_b_proj,
|
||||
}
|
||||
|
||||
# Add indexer for sparse backends
|
||||
if indexer is not None:
|
||||
impl_kwargs["indexer"] = indexer
|
||||
|
||||
# Create impl
|
||||
impl = impl_class(
|
||||
num_heads=mla_dims["num_q_heads"],
|
||||
head_size=mla_dims["head_dim"],
|
||||
scale=scale,
|
||||
num_kv_heads=mla_dims["num_kv_heads"],
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype="auto",
|
||||
logits_soft_cap=None,
|
||||
attn_type="decoder",
|
||||
kv_sharing_target_layer_name=None,
|
||||
q_lora_rank=None,
|
||||
kv_lora_rank=mla_dims["kv_lora_rank"],
|
||||
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
|
||||
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
|
||||
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
|
||||
v_head_dim=mla_dims["v_head_dim"],
|
||||
kv_b_proj=mock_kv_b_proj,
|
||||
)
|
||||
impl = impl_class(**impl_kwargs)
|
||||
|
||||
# Initialize DCP attributes
|
||||
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
|
||||
@@ -515,9 +544,7 @@ def _create_backend_impl(
|
||||
|
||||
# Create builder instance if needed
|
||||
builder_instance = None
|
||||
if backend_cfg["builder_class"]:
|
||||
builder_class = getattr(backend_module, backend_cfg["builder_class"])
|
||||
|
||||
if builder_class:
|
||||
# Populate static_forward_context so builder can find the layer
|
||||
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
|
||||
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
|
||||
@@ -529,7 +556,7 @@ def _create_backend_impl(
|
||||
device=device,
|
||||
)
|
||||
|
||||
return impl, layer, builder_instance
|
||||
return impl, layer, builder_instance, indexer
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@@ -594,6 +621,7 @@ def _run_single_benchmark(
|
||||
backend_cfg: dict,
|
||||
mla_dims: dict,
|
||||
device: torch.device,
|
||||
indexer=None,
|
||||
) -> BenchmarkResult:
|
||||
"""
|
||||
Run a single benchmark iteration.
|
||||
@@ -606,6 +634,7 @@ def _run_single_benchmark(
|
||||
backend_cfg: Backend configuration dict
|
||||
mla_dims: MLA dimension configuration
|
||||
device: Target device
|
||||
indexer: Optional MockIndexer for sparse backends
|
||||
|
||||
Returns:
|
||||
BenchmarkResult with timing statistics
|
||||
@@ -613,7 +642,9 @@ def _run_single_benchmark(
|
||||
# Parse batch spec
|
||||
requests = parse_batch_spec(config.batch_spec)
|
||||
q_lens = [r.q_len for r in requests]
|
||||
kv_lens = [r.kv_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
max_kv_len = max(kv_lens)
|
||||
|
||||
# Determine block size
|
||||
block_size = backend_cfg["block_size"] or config.block_size
|
||||
@@ -641,8 +672,16 @@ def _run_single_benchmark(
|
||||
torch.bfloat16,
|
||||
)
|
||||
|
||||
# Determine which forward method to use based on metadata
|
||||
if metadata.decode is not None:
|
||||
# Fill indexer with random indices for sparse backends
|
||||
is_sparse = backend_cfg.get("is_sparse", False)
|
||||
if is_sparse and indexer is not None:
|
||||
indexer.fill_random_indices(total_q, max_kv_len)
|
||||
|
||||
# Determine which forward method to use
|
||||
if is_sparse:
|
||||
# Sparse backends use forward_mqa
|
||||
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
|
||||
elif metadata.decode is not None:
|
||||
forward_fn = lambda: impl._forward_decode(
|
||||
decode_inputs, kv_cache, metadata, layer
|
||||
)
|
||||
@@ -693,11 +732,13 @@ def _run_single_benchmark(
|
||||
def _run_mla_benchmark_batched(
|
||||
backend: str,
|
||||
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
|
||||
index_topk: int = 2048,
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Unified batched MLA benchmark runner for all backends.
|
||||
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
|
||||
flashinfer_mla_sparse, flashmla_sparse
|
||||
|
||||
This function reuses backend initialization across multiple benchmarks
|
||||
to avoid setup/teardown overhead.
|
||||
@@ -707,6 +748,7 @@ def _run_mla_benchmark_batched(
|
||||
configs_with_params: List of (config, threshold, num_splits) tuples
|
||||
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
|
||||
- num_splits: num_kv_splits (CUTLASS only)
|
||||
index_topk: Topk value for sparse MLA backends (default 2048)
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
@@ -730,19 +772,27 @@ def _run_mla_benchmark_batched(
|
||||
if mla_dims is None:
|
||||
mla_dims = setup_mla_dims("deepseek-v3")
|
||||
|
||||
# Determine if this is a sparse backend
|
||||
is_sparse = backend_cfg.get("is_sparse", False)
|
||||
|
||||
# Create and set vLLM config for MLA (reused across all benchmarks)
|
||||
vllm_config = create_minimal_vllm_config(
|
||||
model_name="deepseek-v3", # Used only for model path
|
||||
block_size=block_size,
|
||||
mla_dims=mla_dims, # Use custom dims from config or default
|
||||
index_topk=index_topk if is_sparse else None,
|
||||
)
|
||||
|
||||
results = []
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
# Create backend impl, layer, and builder (reused across benchmarks)
|
||||
impl, layer, builder_instance = _create_backend_impl(
|
||||
backend_cfg, mla_dims, vllm_config, device
|
||||
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
|
||||
impl, layer, builder_instance, indexer = _create_backend_impl(
|
||||
backend_cfg,
|
||||
mla_dims,
|
||||
vllm_config,
|
||||
device,
|
||||
index_topk=index_topk if is_sparse else None,
|
||||
)
|
||||
|
||||
# Run each benchmark with the shared impl
|
||||
@@ -768,6 +818,7 @@ def _run_mla_benchmark_batched(
|
||||
backend_cfg,
|
||||
mla_dims,
|
||||
device,
|
||||
indexer=indexer,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
@@ -793,20 +844,24 @@ def run_mla_benchmark(
|
||||
config,
|
||||
reorder_batch_threshold: int | None = None,
|
||||
num_kv_splits: int | None = None,
|
||||
index_topk: int = 2048,
|
||||
) -> BenchmarkResult | list[BenchmarkResult]:
|
||||
"""
|
||||
Unified MLA benchmark runner for all backends.
|
||||
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
|
||||
flashinfer_mla_sparse, flashmla_sparse
|
||||
|
||||
Always uses batched execution internally for optimal performance.
|
||||
|
||||
Args:
|
||||
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
|
||||
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
|
||||
flashinfer_mla_sparse, flashmla_sparse)
|
||||
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
|
||||
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
|
||||
(single config mode only)
|
||||
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
|
||||
index_topk: Topk value for sparse MLA backends (default 2048)
|
||||
|
||||
Returns:
|
||||
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
|
||||
@@ -816,9 +871,9 @@ def run_mla_benchmark(
|
||||
# Already in batched format
|
||||
if len(config) > 0 and isinstance(config[0], tuple):
|
||||
# Format: [(cfg, param), ...] where param is threshold or num_splits
|
||||
if backend in ("flashattn_mla", "flashmla"):
|
||||
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
|
||||
configs_with_params = [(cfg, param, None) for cfg, param in config]
|
||||
else: # cutlass_mla or flashinfer_mla
|
||||
else: # cutlass_mla, flashinfer_mla, or sparse backends
|
||||
configs_with_params = [(cfg, None, param) for cfg, param in config]
|
||||
else:
|
||||
# Format: [cfg, ...] - just configs
|
||||
@@ -830,7 +885,7 @@ def run_mla_benchmark(
|
||||
return_single = True
|
||||
|
||||
# Use unified batched execution
|
||||
results = _run_mla_benchmark_batched(backend, configs_with_params)
|
||||
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
|
||||
|
||||
# Return single result or list based on input
|
||||
return results[0] if return_single else results
|
||||
|
||||
@@ -8,7 +8,9 @@ This module provides helpers for running standard attention backends
|
||||
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import types
|
||||
from contextlib import contextmanager
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -24,8 +26,13 @@ from vllm.config import (
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
CommonAttentionMetadata,
|
||||
get_kv_cache_layout,
|
||||
set_kv_cache_layout,
|
||||
)
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
|
||||
# ============================================================================
|
||||
@@ -33,37 +40,41 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
# ============================================================================
_BACKEND_CONFIG = {
|
||||
"flash": {
|
||||
"module": "vllm.v1.attention.backends.flash_attn",
|
||||
"backend_class": "FlashAttentionBackend",
|
||||
"dtype": torch.float16,
|
||||
"cache_layout": "standard",
|
||||
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
|
||||
},
|
||||
"triton": {
|
||||
"module": "vllm.v1.attention.backends.triton_attn",
|
||||
"backend_class": "TritonAttentionBackend",
|
||||
"dtype": torch.float32,
|
||||
"cache_layout": "standard",
|
||||
},
|
||||
"flashinfer": {
|
||||
"module": "vllm.v1.attention.backends.flashinfer",
|
||||
"backend_class": "FlashInferBackend",
|
||||
"dtype": torch.float16,
|
||||
"cache_layout": "flashinfer",
|
||||
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
|
||||
},
|
||||
}
def _get_backend_config(backend: str) -> dict:
|
||||
if backend not in _BACKEND_CONFIG:
|
||||
"""
|
||||
Get backend configuration from AttentionBackendEnum.
|
||||
|
||||
Args:
|
||||
backend: Backend name matching AttentionBackendEnum exactly
|
||||
(e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
|
||||
|
||||
Returns:
|
||||
Dict with backend_class
|
||||
"""
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
try:
|
||||
backend_enum = AttentionBackendEnum[backend]
|
||||
backend_class = backend_enum.get_class()
|
||||
except (KeyError, ValueError) as e:
|
||||
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
|
||||
raise ValueError(
|
||||
f"Unknown backend: {backend}. "
|
||||
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
|
||||
)
|
||||
return _BACKEND_CONFIG[backend]
|
||||
f"Unknown backend: {backend}. Valid backends: {valid_backends}"
|
||||
) from e
|
||||
|
||||
return {"backend_class": backend_class}
@contextmanager
|
||||
def log_warnings_and_errors_only():
|
||||
"""Temporarily set vLLM logger to WARNING level."""
|
||||
logger = logging.getLogger("vllm")
|
||||
old_level = logger.level
|
||||
logger.setLevel(logging.WARNING)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
logger.setLevel(old_level)
# ============================================================================
|
||||
@@ -88,11 +99,7 @@ def _build_common_attn_metadata(
|
||||
query_start_loc_cpu = query_start_loc.cpu()
|
||||
|
||||
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
|
||||
seq_lens_cpu = seq_lens.cpu()
|
||||
max_seq_len = int(seq_lens_cpu.max())
|
||||
|
||||
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
|
||||
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
|
||||
max_seq_len = int(seq_lens.max().item())
|
||||
|
||||
max_blocks = (max(kv_lens) + block_size - 1) // block_size
|
||||
num_blocks = batch_size * max_blocks
|
||||
@@ -107,8 +114,6 @@ def _build_common_attn_metadata(
|
||||
query_start_loc=query_start_loc,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
num_reqs=batch_size,
|
||||
num_actual_tokens=total_tokens,
|
||||
max_query_len=max_query_len,
|
||||
@@ -121,7 +126,6 @@ def _build_common_attn_metadata(
|
||||
|
||||
def _create_vllm_config(
|
||||
config: BenchmarkConfig,
|
||||
dtype: torch.dtype,
|
||||
max_num_blocks: int,
|
||||
) -> VllmConfig:
|
||||
"""Create a VllmConfig for benchmarking with mock model methods."""
|
||||
@@ -129,7 +133,7 @@ def _create_vllm_config(
|
||||
model="meta-llama/Meta-Llama-3-8B",
|
||||
tokenizer="meta-llama/Meta-Llama-3-8B",
|
||||
trust_remote_code=False,
|
||||
dtype=dtype,
|
||||
dtype="auto", # Use model's native dtype
|
||||
seed=0,
|
||||
max_model_len=1024,
|
||||
)
|
||||
@@ -198,15 +202,12 @@ def _create_backend_impl(
|
||||
backend_cfg: dict,
|
||||
config: BenchmarkConfig,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
"""Create backend implementation instance."""
|
||||
import importlib
|
||||
|
||||
backend_module = importlib.import_module(backend_cfg["module"])
|
||||
backend_class = getattr(backend_module, backend_cfg["backend_class"])
|
||||
backend_class = backend_cfg["backend_class"]
|
||||
|
||||
scale = get_attention_scale(config.head_dim)
|
||||
dtype = backend_cfg["dtype"]
|
||||
|
||||
impl = backend_class.get_impl_cls()(
|
||||
num_heads=config.num_q_heads,
|
||||
@@ -227,7 +228,7 @@ def _create_backend_impl(
|
||||
|
||||
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
|
||||
|
||||
return backend_class, impl, layer, dtype
|
||||
return backend_class, impl, layer
def _create_metadata_builder(
|
||||
@@ -235,11 +236,44 @@ def _create_metadata_builder(
|
||||
kv_cache_spec: FullAttentionSpec,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
backend_name: str = "",
|
||||
):
|
||||
"""Create metadata builder instance."""
|
||||
return backend_class.get_builder_cls()(
|
||||
layer_names = ["layer_0"]
|
||||
builder_cls = backend_class.get_builder_cls()
|
||||
|
||||
# Flashinfer needs get_per_layer_parameters mocked since we don't have
|
||||
# real model layers registered
|
||||
if backend_name == "FLASHINFER":
|
||||
import unittest.mock
|
||||
|
||||
from vllm.v1.attention.backends.utils import PerLayerParameters
|
||||
|
||||
def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
|
||||
head_size = vllm_config.model_config.get_head_size()
|
||||
return {
|
||||
layer_name: PerLayerParameters(
|
||||
window_left=-1, # No sliding window
|
||||
logits_soft_cap=0.0, # No soft cap
|
||||
sm_scale=1.0 / (head_size**0.5), # Standard scale
|
||||
)
|
||||
for layer_name in layer_names
|
||||
}
|
||||
|
||||
with unittest.mock.patch(
|
||||
"vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
|
||||
mock_get_per_layer_parameters,
|
||||
):
|
||||
return builder_cls(
|
||||
kv_cache_spec=kv_cache_spec,
|
||||
layer_names=layer_names,
|
||||
vllm_config=vllm_config,
|
||||
device=device,
|
||||
)
|
||||
|
||||
return builder_cls(
|
||||
kv_cache_spec=kv_cache_spec,
|
||||
layer_names=["layer_0"],
|
||||
layer_names=layer_names,
|
||||
vllm_config=vllm_config,
|
||||
device=device,
|
||||
)
|
||||
@@ -281,39 +315,44 @@ def _create_input_tensors(
|
||||
def _create_kv_cache(
|
||||
config: BenchmarkConfig,
|
||||
max_num_blocks: int,
|
||||
cache_layout: str,
|
||||
backend_class,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
) -> list:
|
||||
"""Create KV cache tensors for all layers."""
|
||||
if cache_layout == "flashinfer":
|
||||
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
|
||||
cache_list = [
|
||||
torch.zeros(
|
||||
max_num_blocks,
|
||||
2,
|
||||
config.block_size,
|
||||
config.num_kv_heads,
|
||||
config.head_dim,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
else:
|
||||
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
|
||||
cache_list = [
|
||||
torch.zeros(
|
||||
2,
|
||||
max_num_blocks,
|
||||
config.block_size,
|
||||
config.num_kv_heads,
|
||||
config.head_dim,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
"""Create KV cache tensors for all layers using the backend's methods.
|
||||
|
||||
Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
|
||||
to create the cache with the correct shape and memory layout.
|
||||
"""
|
||||
# Get the logical shape from the backend
|
||||
cache_shape = backend_class.get_kv_cache_shape(
|
||||
num_blocks=max_num_blocks,
|
||||
block_size=config.block_size,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
head_size=config.head_dim,
|
||||
)
|
||||
|
||||
# Get the stride order for custom memory layout
|
||||
try:
|
||||
stride_order = backend_class.get_kv_cache_stride_order()
|
||||
assert len(stride_order) == len(cache_shape)
|
||||
except (AttributeError, NotImplementedError):
|
||||
stride_order = tuple(range(len(cache_shape)))
|
||||
|
||||
# Permute shape to physical layout order
|
||||
physical_shape = tuple(cache_shape[i] for i in stride_order)
|
||||
|
||||
# Compute inverse permutation to get back to logical view
|
||||
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
|
||||
|
||||
cache_list = []
|
||||
for _ in range(config.num_layers):
|
||||
# Allocate in physical layout order (contiguous in memory)
|
||||
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
|
||||
# Permute to logical view
|
||||
cache = cache.permute(*inv_order)
|
||||
cache_list.append(cache)
|
||||
|
||||
return cache_list
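A self-contained sketch of the allocate-then-permute trick used above, with made-up shapes and a hypothetical stride order; it only illustrates why the inverse permutation recovers the logical view over contiguously allocated physical memory.

import torch

logical_shape = (2, 8, 16, 4, 64)   # e.g. [2, num_blocks, block_size, heads, head_dim]
stride_order = (1, 0, 2, 3, 4)      # hypothetical: num_blocks outermost in memory

physical_shape = tuple(logical_shape[i] for i in stride_order)
inv_order = [stride_order.index(i) for i in range(len(stride_order))]

buf = torch.zeros(*physical_shape)  # contiguous in the physical layout
view = buf.permute(*inv_order)      # logical view over the same memory

assert view.shape == logical_shape
assert view.permute(*stride_order).is_contiguous()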
@@ -396,7 +435,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
"""
|
||||
Run standard attention benchmark with real kernels.
|
||||
|
||||
Supports: flash, triton, flashinfer
|
||||
Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
|
||||
|
||||
Args:
|
||||
config: Benchmark configuration
|
||||
@@ -411,60 +450,79 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
|
||||
requests = parse_batch_spec(config.batch_spec)
|
||||
|
||||
if config.backend == "flashinfer":
|
||||
if config.backend == "FLASHINFER":
|
||||
requests = reorder_for_flashinfer(requests)
|
||||
|
||||
q_lens = [r.q_len for r in requests]
|
||||
kv_lens = [r.kv_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
max_kv = max(kv_lens)
|
||||
batch_size = len(q_lens)
|
||||
|
||||
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
|
||||
# Calculate total blocks needed: batch_size * max_blocks_per_request
|
||||
max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
|
||||
max_num_blocks = batch_size * max_blocks_per_request
|
||||
|
||||
backend_class, impl, layer, dtype = _create_backend_impl(
|
||||
backend_cfg, config, device
|
||||
)
|
||||
# Suppress vLLM logs during setup to reduce spam
|
||||
with log_warnings_and_errors_only():
|
||||
# Create vllm_config first - uses model's native dtype via "auto"
|
||||
vllm_config = _create_vllm_config(config, max_num_blocks)
|
||||
dtype = vllm_config.model_config.dtype
|
||||
|
||||
common_metadata = _build_common_attn_metadata(
|
||||
q_lens, kv_lens, config.block_size, device
|
||||
)
|
||||
# Wrap everything in set_current_vllm_config context
|
||||
# This is required for backends like flashinfer that need global config
|
||||
with set_current_vllm_config(vllm_config):
|
||||
backend_class, impl, layer = _create_backend_impl(
|
||||
backend_cfg, config, device, dtype
|
||||
)
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=config.block_size,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
head_size=config.head_dim,
|
||||
dtype=dtype,
|
||||
)
|
||||
# Set KV cache layout if the backend requires a specific one
|
||||
# (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
|
||||
required_layout = backend_class.get_required_kv_cache_layout()
|
||||
if required_layout is not None:
|
||||
set_kv_cache_layout(required_layout)
|
||||
get_kv_cache_layout.cache_clear()
|
||||
|
||||
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
|
||||
common_metadata = _build_common_attn_metadata(
|
||||
q_lens, kv_lens, config.block_size, device
|
||||
)
|
||||
|
||||
builder = _create_metadata_builder(
|
||||
backend_class, kv_cache_spec, vllm_config, device
|
||||
)
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=config.block_size,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
head_size=config.head_dim,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
attn_metadata = builder.build(
|
||||
common_prefix_len=0,
|
||||
common_attn_metadata=common_metadata,
|
||||
)
|
||||
builder = _create_metadata_builder(
|
||||
backend_class, kv_cache_spec, vllm_config, device, config.backend
|
||||
)
|
||||
|
||||
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
|
||||
attn_metadata = builder.build(
|
||||
common_prefix_len=0,
|
||||
common_attn_metadata=common_metadata,
|
||||
)
|
||||
|
||||
cache_list = _create_kv_cache(
|
||||
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
|
||||
)
|
||||
q_list, k_list, v_list = _create_input_tensors(
|
||||
config, total_q, device, dtype
|
||||
)
|
||||
|
||||
times, mem_stats = _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
q_list,
|
||||
k_list,
|
||||
v_list,
|
||||
cache_list,
|
||||
attn_metadata,
|
||||
device,
|
||||
dtype,
|
||||
)
|
||||
cache_list = _create_kv_cache(
|
||||
config, max_num_blocks, backend_class, device, dtype
|
||||
)
|
||||
|
||||
times, mem_stats = _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
q_list,
|
||||
k_list,
|
||||
v_list,
|
||||
cache_list,
|
||||
attn_metadata,
|
||||
device,
|
||||
dtype,
|
||||
)
|
||||
|
||||
mean_time = np.mean(times)
|
||||
throughput = total_q / mean_time if mean_time > 0 else 0
|
||||
|
||||
@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
|
||||
echo "RESULT_FILE=$RESULT"
|
||||
echo "====================== AUTO TUNEPARAMETERS ===================="
|
||||
|
||||
rm -rf $LOG_FOLDER
|
||||
rm -rf $PROFILE_PATH
|
||||
mkdir -p $LOG_FOLDER
|
||||
mkdir -p $PROFILE_PATH
|
||||
rm -rf "$LOG_FOLDER"
|
||||
rm -rf "$PROFILE_PATH"
|
||||
mkdir -p "$LOG_FOLDER"
|
||||
mkdir -p "$PROFILE_PATH"
|
||||
|
||||
cd "$BASE/vllm"
|
||||
|
||||
@@ -114,7 +114,7 @@ start_server() {
|
||||
|
||||
# wait for 10 minutes...
|
||||
server_started=0
|
||||
for i in {1..60}; do
|
||||
for _ in {1..60}; do
|
||||
# This line checks whether the server is still alive,
# since we should always have permission to send a signal to the server process.
|
||||
kill -0 $server_pid 2> /dev/null || break
|
||||
@@ -145,12 +145,12 @@ run_benchmark() {
|
||||
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||
echo "vllm_log: $vllm_log"
|
||||
echo
|
||||
rm -f $vllm_log
|
||||
rm -f "$vllm_log"
|
||||
pkill -if "vllm serve" || true
|
||||
|
||||
echo "starting server..."
|
||||
# Call start_server without a profile_dir to avoid profiling overhead
|
||||
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
|
||||
start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
|
||||
result=$?
|
||||
if [[ "$result" -eq 1 ]]; then
|
||||
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||
@@ -168,15 +168,15 @@ run_benchmark() {
|
||||
# --profile flag is removed from this call
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--model "$MODEL" \
|
||||
--dataset-name random \
|
||||
--random-input-len $adjusted_input_len \
|
||||
--random-output-len $OUTPUT_LEN \
|
||||
--random-output-len "$OUTPUT_LEN" \
|
||||
--ignore-eos \
|
||||
--disable-tqdm \
|
||||
--request-rate inf \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
|
||||
--num-prompts 1000 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
@@ -195,20 +195,20 @@ run_benchmark() {
|
||||
request_rate=$((${throughput%.*} + 1))
|
||||
while ((request_rate > 0)); do
|
||||
# clear prefix cache
|
||||
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
|
||||
curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
|
||||
sleep 5
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--model "$MODEL" \
|
||||
--dataset-name random \
|
||||
--random-input-len $adjusted_input_len \
|
||||
--random-output-len $OUTPUT_LEN \
|
||||
--random-output-len "$OUTPUT_LEN" \
|
||||
--ignore-eos \
|
||||
--disable-tqdm \
|
||||
--request-rate $request_rate \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
|
||||
find_gpu_memory_utilization=0
|
||||
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
|
||||
# Pass empty string for profile_dir argument
|
||||
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
|
||||
start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
|
||||
result=$?
|
||||
if [[ "$result" -eq 0 ]]; then
|
||||
find_gpu_memory_utilization=1
|
||||
@@ -274,7 +274,7 @@ fi
|
||||
|
||||
for num_seqs in "${num_seqs_list[@]}"; do
|
||||
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
|
||||
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
|
||||
run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
|
||||
done
|
||||
done
|
||||
echo "finish permutations"
|
||||
@@ -285,7 +285,7 @@ echo "finish permutations"
|
||||
if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
||||
echo
|
||||
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
|
||||
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
|
||||
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
|
||||
echo
|
||||
|
||||
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
|
||||
@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
||||
|
||||
# Start server with the best params and profiling ENABLED
|
||||
echo "Starting server for profiling..."
|
||||
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
|
||||
start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
|
||||
|
||||
# Run benchmark with the best params and the --profile flag
|
||||
echo "Running benchmark with profiling..."
|
||||
@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
||||
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--model "$MODEL" \
|
||||
--dataset-name random \
|
||||
--random-input-len $adjusted_input_len \
|
||||
--random-output-len $OUTPUT_LEN \
|
||||
--random-output-len "$OUTPUT_LEN" \
|
||||
--ignore-eos \
|
||||
--disable-tqdm \
|
||||
--request-rate $best_request_rate \
|
||||
--request-rate "$best_request_rate" \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
|
||||
@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
|
||||
else
|
||||
STATUS="FAILURE"
|
||||
((FAILURE_COUNT++))
|
||||
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
|
||||
FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
|
||||
fi
|
||||
|
||||
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
471
benchmarks/benchmark_topk_topp.py
Normal file
@@ -0,0 +1,471 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Benchmark comparing Triton vs PyTorch sort-based top-k/top-p implementations.
|
||||
|
||||
Compares:
|
||||
- apply_top_k_top_p_triton (Triton binary search)
|
||||
- apply_top_k_top_p (PyTorch sort-based)
|
||||
|
||||
Scenarios:
|
||||
- top_k only (whole batch, partial batch)
|
||||
- top_p only (whole batch, partial batch)
|
||||
- mix of top_k and top_p
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch
|
||||
from vllm.v1.sample.ops.topk_topp_triton import (
|
||||
apply_top_k_top_p_triton,
|
||||
reset_buffer_cache,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkConfig:
|
||||
"""Configuration for a benchmark run."""
|
||||
|
||||
name: str
|
||||
batch_size: int
|
||||
vocab_size: int
|
||||
# k and p can be tensors or None
|
||||
k_values: torch.Tensor | None # [batch_size] or None
|
||||
p_values: torch.Tensor | None # [batch_size] or None
|
||||
description: str
|
||||
ops_pct: float = 0.0 # Percentage of ops relative to batch size
|
||||
|
||||
|
||||
def calculate_ops_pct(
|
||||
k_values: torch.Tensor | None,
|
||||
p_values: torch.Tensor | None,
|
||||
vocab_size: int,
|
||||
batch_size: int,
|
||||
) -> float:
|
||||
"""
|
||||
Calculate the percentage of active top-k and top-p operations.
|
||||
|
||||
Returns percentage where 100% = batch_size ops.
|
||||
E.g., if all rows have both top-k and top-p active, returns 200%.
|
||||
"""
|
||||
active_ops = 0
|
||||
|
||||
if k_values is not None:
|
||||
# Count rows where k < vocab_size (active top-k filtering)
|
||||
active_ops += (k_values < vocab_size).sum().item()
|
||||
|
||||
if p_values is not None:
|
||||
# Count rows where p < 1.0 (active top-p filtering)
|
||||
active_ops += (p_values < 1.0).sum().item()
|
||||
|
||||
return (active_ops / batch_size) * 100 if batch_size > 0 else 0.0
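A small worked example of the 200% case mentioned in the docstring (values are made up):

import torch

# Batch of 4 where every row has an active top-k (k < vocab) and an active
# top-p (p < 1.0): 4 + 4 = 8 active ops -> 200% of the batch size.
k = torch.full((4,), 50, dtype=torch.int32)
p = torch.full((4,), 0.9)
assert calculate_ops_pct(k, p, vocab_size=32768, batch_size=4) == 200.0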
def create_logits(
|
||||
batch_size: int, vocab_size: int, device: str = "cuda"
|
||||
) -> torch.Tensor:
|
||||
"""Create random logits mimicking a realistic LLM distribution.
|
||||
|
||||
Uses a Zipf-like probability distribution (rank^-1.1) converted to logits
|
||||
via log, then randomly permuted per row. This produces a peaked distribution
|
||||
where a small number of tokens capture most probability mass, similar to
|
||||
real model outputs.
|
||||
"""
|
||||
# Create Zipf-like probabilities: p(rank) ~ rank^(-alpha)
|
||||
ranks = torch.arange(1, vocab_size + 1, dtype=torch.float32, device=device)
|
||||
probs = ranks.pow(-1.1)
|
||||
probs = probs / probs.sum()
|
||||
|
||||
# Convert to logits (log-probabilities, unnormalized is fine)
|
||||
base_logits = probs.log()
|
||||
|
||||
# Broadcast to batch and randomly permute each row
|
||||
logits = base_logits.unsqueeze(0).expand(batch_size, -1).clone()
|
||||
for i in range(batch_size):
|
||||
logits[i] = logits[i, torch.randperm(vocab_size, device=device)]
|
||||
|
||||
return logits
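A possible vectorized variant of the per-row shuffle above (a sketch, not part of this change): argsort over random keys yields an independent permutation per row without the Python loop.

import torch

def shuffle_rows(logits: torch.Tensor) -> torch.Tensor:
    # Independent per-row permutation via argsort of random keys.
    keys = torch.rand_like(logits)
    perm = keys.argsort(dim=-1)
    return torch.gather(logits, dim=-1, index=perm)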
def measure_memory() -> tuple[int, int]:
|
||||
"""Return (allocated, reserved) memory in bytes."""
|
||||
torch.cuda.synchronize()
|
||||
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
||||
|
||||
|
||||
def reset_memory_stats():
|
||||
"""Reset peak memory statistics."""
|
||||
reset_buffer_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
|
||||
def benchmark_function(
|
||||
func,
|
||||
logits: torch.Tensor,
|
||||
k: torch.Tensor | None,
|
||||
p: torch.Tensor | None,
|
||||
warmup_iters: int = 5,
|
||||
benchmark_iters: int = 20,
|
||||
) -> tuple[float, int]:
|
||||
"""
|
||||
Benchmark a function and return (avg_time_ms, peak_memory_bytes).
|
||||
|
||||
Returns average time in milliseconds and peak memory usage.
|
||||
"""
|
||||
# Warmup
|
||||
for _ in range(warmup_iters):
|
||||
logits_copy = logits.clone()
|
||||
func(logits_copy, k, p)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Reset memory stats before benchmark
|
||||
reset_memory_stats()
|
||||
|
||||
# Benchmark
|
||||
start_events = [
|
||||
torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)
|
||||
]
|
||||
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)]
|
||||
|
||||
for i in range(benchmark_iters):
|
||||
logits_copy = logits.clone()
|
||||
start_events[i].record()
|
||||
func(logits_copy, k, p)
|
||||
end_events[i].record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Calculate timing
|
||||
times = [
|
||||
start_events[i].elapsed_time(end_events[i]) for i in range(benchmark_iters)
|
||||
]
|
||||
avg_time = sum(times) / len(times)
|
||||
|
||||
# Get peak memory
|
||||
_, peak_memory = measure_memory()
|
||||
|
||||
return avg_time, peak_memory
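A hypothetical call, to show how the pieces fit together (batch size, k, and the device are made up; the functions are the ones defined in this file):

# Hypothetical usage; requires a CUDA device.
logits = create_logits(batch_size=64, vocab_size=32768)
k = torch.full((64,), 50, dtype=torch.int32, device="cuda")
avg_ms, peak_bytes = benchmark_function(apply_top_k_top_p_triton, logits, k, None)
print(f"{avg_ms:.3f} ms, {peak_bytes / 1024**2:.1f} MiB peak")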
def create_benchmark_configs(
|
||||
batch_sizes: list[int],
|
||||
vocab_sizes: list[int],
|
||||
device: str = "cuda",
|
||||
) -> list[BenchmarkConfig]:
|
||||
"""Create all benchmark configurations."""
|
||||
configs = []
|
||||
|
||||
for vocab_size in vocab_sizes:
|
||||
for batch_size in batch_sizes:
|
||||
# 1. Top-k only - whole batch (all rows have k < vocab_size)
|
||||
k_all = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
name=f"topk_whole_b{batch_size}_v{vocab_size // 1000}k",
|
||||
batch_size=batch_size,
|
||||
vocab_size=vocab_size,
|
||||
k_values=k_all,
|
||||
p_values=None,
|
||||
description=f"Top-k only (whole batch, k=50), "
|
||||
f"batch={batch_size}, vocab={vocab_size}",
|
||||
ops_pct=calculate_ops_pct(k_all, None, vocab_size, batch_size),
|
||||
)
|
||||
)
|
||||
|
||||
# 2. Top-k only - partial batch (half have k=50, half have k=vocab_size)
|
||||
k_partial = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
|
||||
k_partial[batch_size // 2 :] = vocab_size # No filtering for second half
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
name=f"topk_partial_b{batch_size}_v{vocab_size // 1000}k",
|
||||
batch_size=batch_size,
|
||||
vocab_size=vocab_size,
|
||||
k_values=k_partial,
|
||||
p_values=None,
|
||||
description=f"Top-k only (partial batch, 50% k=50, 50% k=vocab), "
|
||||
f"batch={batch_size}, vocab={vocab_size}",
|
||||
ops_pct=calculate_ops_pct(k_partial, None, vocab_size, batch_size),
|
||||
)
|
||||
)
|
||||
|
||||
# 3. Top-p only - whole batch (all rows have p < 1.0)
|
||||
p_all = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
name=f"topp_whole_b{batch_size}_v{vocab_size // 1000}k",
|
||||
batch_size=batch_size,
|
||||
vocab_size=vocab_size,
|
||||
k_values=None,
|
||||
p_values=p_all,
|
||||
description=f"Top-p only (whole batch, p=0.9), "
|
||||
f"batch={batch_size}, vocab={vocab_size}",
|
||||
ops_pct=calculate_ops_pct(None, p_all, vocab_size, batch_size),
|
||||
)
|
||||
)
|
||||
|
||||
# 4. Top-p only - partial batch (half have p=0.9, half have p=1.0)
|
||||
p_partial = torch.full(
|
||||
(batch_size,), 0.9, dtype=torch.float32, device=device
|
||||
)
|
||||
p_partial[batch_size // 2 :] = 1.0 # No filtering for second half
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
name=f"topp_partial_b{batch_size}_v{vocab_size // 1000}k",
|
||||
batch_size=batch_size,
|
||||
vocab_size=vocab_size,
|
||||
k_values=None,
|
||||
p_values=p_partial,
|
||||
description=f"Top-p only (partial batch, 50% p=0.9, 50% p=1.0), "
|
||||
f"batch={batch_size}, vocab={vocab_size}",
|
||||
ops_pct=calculate_ops_pct(None, p_partial, vocab_size, batch_size),
|
||||
)
|
||||
)
|
||||
|
||||
# 5. Mix of top-k and top-p (both applied to whole batch)
|
||||
k_mix = torch.full((batch_size,), 100, dtype=torch.int32, device=device)
|
||||
p_mix = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
name=f"topk_topp_whole_b{batch_size}_v{vocab_size // 1000}k",
|
||||
batch_size=batch_size,
|
||||
vocab_size=vocab_size,
|
||||
k_values=k_mix,
|
||||
p_values=p_mix,
|
||||
description=f"Top-k + Top-p (whole batch, k=100, p=0.9), "
|
||||
f"batch={batch_size}, vocab={vocab_size}",
|
||||
ops_pct=calculate_ops_pct(k_mix, p_mix, vocab_size, batch_size),
|
||||
)
|
||||
)
|
||||
|
||||
# 6. Mix with partial application (some rows k only, some p only, some both)
|
||||
k_mixed = torch.full(
|
||||
(batch_size,), vocab_size, dtype=torch.int32, device=device
|
||||
)
|
||||
p_mixed = torch.full((batch_size,), 1.0, dtype=torch.float32, device=device)
|
||||
# First third: k only
|
||||
third = batch_size // 3
|
||||
k_mixed[:third] = 50
|
||||
# Second third: p only
|
||||
p_mixed[third : 2 * third] = 0.5
|
||||
# Last third: both k and p
|
||||
k_mixed[2 * third :] = 100
|
||||
p_mixed[2 * third :] = 0.9
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
name=f"mixed_partial_b{batch_size}_v{vocab_size // 1000}k",
|
||||
batch_size=batch_size,
|
||||
vocab_size=vocab_size,
|
||||
k_values=k_mixed,
|
||||
p_values=p_mixed,
|
||||
description=f"Mixed partial (1/3 k=50, 1/3 p=0.9, 1/3 both), "
|
||||
f"batch={batch_size}, vocab={vocab_size}",
|
||||
ops_pct=calculate_ops_pct(k_mixed, p_mixed, vocab_size, batch_size),
|
||||
)
|
||||
)
|
||||
|
||||
return configs
|
||||
|
||||
|
||||
def format_memory(bytes_val: int) -> str:
|
||||
"""Format memory in human-readable form."""
|
||||
if bytes_val >= 1024**3:
|
||||
return f"{bytes_val / (1024**3):.2f} GB"
|
||||
elif bytes_val >= 1024**2:
|
||||
return f"{bytes_val / (1024**2):.2f} MB"
|
||||
elif bytes_val >= 1024:
|
||||
return f"{bytes_val / 1024:.2f} KB"
|
||||
return f"{bytes_val} B"
|
||||
|
||||
|
||||
def run_benchmark(
|
||||
configs: list[BenchmarkConfig],
|
||||
warmup_iters: int = 5,
|
||||
benchmark_iters: int = 20,
|
||||
verbose: bool = True,
|
||||
):
|
||||
"""Run all benchmarks and print results."""
|
||||
results = []
|
||||
|
||||
print("=" * 100)
|
||||
print("Top-k/Top-p Benchmark: Triton vs PyTorch Sort-based")
|
||||
print("=" * 100)
|
||||
print()
|
||||
|
||||
for config in configs:
|
||||
if verbose:
|
||||
print(f"Running: {config.description}")
|
||||
|
||||
# Create fresh logits for this config
|
||||
logits = create_logits(config.batch_size, config.vocab_size)
|
||||
|
||||
# Benchmark Triton
|
||||
reset_memory_stats()
|
||||
triton_time, triton_mem = benchmark_function(
|
||||
apply_top_k_top_p_triton,
|
||||
logits,
|
||||
config.k_values,
|
||||
config.p_values,
|
||||
warmup_iters,
|
||||
benchmark_iters,
|
||||
)
|
||||
|
||||
# Benchmark PyTorch
|
||||
reset_memory_stats()
|
||||
pytorch_time, pytorch_mem = benchmark_function(
|
||||
apply_top_k_top_p_pytorch,
|
||||
logits,
|
||||
config.k_values,
|
||||
config.p_values,
|
||||
warmup_iters,
|
||||
benchmark_iters,
|
||||
)
|
||||
|
||||
speedup = pytorch_time / triton_time if triton_time > 0 else float("inf")
|
||||
mem_ratio = pytorch_mem / triton_mem if triton_mem > 0 else float("inf")
|
||||
|
||||
result = {
|
||||
"config": config,
|
||||
"triton_time_ms": triton_time,
|
||||
"pytorch_time_ms": pytorch_time,
|
||||
"triton_mem": triton_mem,
|
||||
"pytorch_mem": pytorch_mem,
|
||||
"speedup": speedup,
|
||||
"mem_ratio": mem_ratio,
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
if verbose:
|
||||
print(f" Triton: {triton_time:.3f} ms, {format_memory(triton_mem)}")
|
||||
print(f" PyTorch: {pytorch_time:.3f} ms, {format_memory(pytorch_mem)}")
|
||||
print(f" Speedup: {speedup:.2f}x, Memory ratio: {mem_ratio:.2f}x")
|
||||
print()
|
||||
|
||||
# Clean up
|
||||
del logits
|
||||
reset_memory_stats()
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def print_summary_table(results: list[dict]):
|
||||
"""Print a summary table of results."""
|
||||
print()
|
||||
print("=" * 130)
|
||||
print("SUMMARY TABLE")
|
||||
print("=" * 130)
|
||||
print()
|
||||
|
||||
# Header
|
||||
header = (
|
||||
f"{'Scenario':<40} {'Batch':>6} {'Vocab':>7} {'Ops%':>6} "
|
||||
f"{'Triton (ms)':>12} {'PyTorch (ms)':>13} {'Speedup':>8} "
|
||||
f"{'Tri Mem':>10} {'Pyt Mem':>10}"
|
||||
)
|
||||
print(header)
|
||||
print("-" * 130)
|
||||
|
||||
# Group by scenario type
|
||||
current_vocab = None
|
||||
for result in results:
|
||||
config = result["config"]
|
||||
|
||||
# Add separator between vocab sizes
|
||||
if current_vocab != config.vocab_size:
|
||||
if current_vocab is not None:
|
||||
print("-" * 130)
|
||||
current_vocab = config.vocab_size
|
||||
|
||||
scenario = config.name.split("_b")[0] # Extract scenario name
|
||||
print(
|
||||
f"{scenario:<40} {config.batch_size:>6} {config.vocab_size:>7} "
|
||||
f"{config.ops_pct:>5.0f}% "
|
||||
f"{result['triton_time_ms']:>12.3f} {result['pytorch_time_ms']:>13.3f} "
|
||||
f"{result['speedup']:>7.2f}x "
|
||||
f"{format_memory(result['triton_mem']):>10} "
|
||||
f"{format_memory(result['pytorch_mem']):>10}"
|
||||
)
|
||||
|
||||
print("=" * 130)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Benchmark Triton vs PyTorch sort-based top-k/top-p implementations"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-sizes",
|
||||
type=int,
|
||||
nargs="+",
|
||||
default=[1, 4, 16, 64, 128, 512, 1024, 2048],
|
||||
help="Batch sizes to test (default: 1 4 16 64)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vocab-sizes",
|
||||
type=int,
|
||||
nargs="+",
|
||||
default=[32768, 131072], # 32k, 128k
|
||||
help="Vocabulary sizes to test (default: 32768 131072)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--warmup-iters",
|
||||
type=int,
|
||||
default=5,
|
||||
help="Number of warmup iterations (default: 5)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--benchmark-iters",
|
||||
type=int,
|
||||
default=20,
|
||||
help="Number of benchmark iterations (default: 20)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--quiet",
|
||||
action="store_true",
|
||||
help="Only print summary table",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Print configuration
|
||||
print(f"Batch sizes: {args.batch_sizes}")
|
||||
print(f"Vocab sizes: {args.vocab_sizes}")
|
||||
print(f"Warmup iterations: {args.warmup_iters}")
|
||||
print(f"Benchmark iterations: {args.benchmark_iters}")
|
||||
print()
|
||||
|
||||
# Check CUDA
|
||||
if not torch.cuda.is_available():
|
||||
print("ERROR: CUDA is not available. This benchmark requires a GPU.")
|
||||
return
|
||||
|
||||
device_name = torch.cuda.get_device_name(0)
|
||||
print(f"GPU: {device_name}")
|
||||
print()
|
||||
|
||||
# Create configs
|
||||
configs = create_benchmark_configs(
|
||||
args.batch_sizes,
|
||||
args.vocab_sizes,
|
||||
)
|
||||
|
||||
# Run benchmarks
|
||||
results = run_benchmark(
|
||||
configs,
|
||||
warmup_iters=args.warmup_iters,
|
||||
benchmark_iters=args.benchmark_iters,
|
||||
verbose=not args.quiet,
|
||||
)
|
||||
|
||||
# Print summary
|
||||
print_summary_table(results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
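For readers unfamiliar with the operation being benchmarked, a conceptual sort-based sketch of top-k/top-p filtering; it is not the apply_top_k_top_p implementation used above and treats k and p as scalars rather than per-row tensors.

import torch

def topk_topp_reference(logits: torch.Tensor, k: int, p: float) -> torch.Tensor:
    # Keep the top-k logits per row, then keep the smallest prefix of the
    # sorted distribution whose probability mass reaches p; mask the rest.
    vals, idx = logits.sort(dim=-1, descending=True)

    keep = torch.zeros_like(logits, dtype=torch.bool)
    keep.scatter_(-1, idx[..., :k], True)

    probs = torch.softmax(vals, dim=-1)
    cum = probs.cumsum(dim=-1)
    within_p = (cum - probs) < p        # always keeps the most likely token
    keep_p = torch.zeros_like(keep)
    keep_p.scatter_(-1, idx, within_p)

    return logits.masked_fill(~(keep & keep_p), float("-inf"))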
@@ -13,6 +13,7 @@ from torch.utils.benchmark import Measurement as TMeasurement
|
||||
from tqdm import tqdm
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.benchmarks.lib.utils import default_vllm_config
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8,
|
||||
@@ -291,6 +292,7 @@ def print_timers(timers: Iterable[TMeasurement]):
|
||||
compare.print()
|
||||
|
||||
|
||||
@default_vllm_config()
|
||||
def main():
|
||||
torch.set_default_device("cuda")
|
||||
bench_params = get_bench_params()
|
||||
|
||||
@@ -7,6 +7,7 @@ import itertools
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.activation # noqa F401
|
||||
from vllm.benchmarks.lib.utils import default_vllm_config
|
||||
from vllm.model_executor.custom_op import op_registry
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
@@ -18,6 +19,7 @@ intermediate_size = [3072, 9728, 12288]
|
||||
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
|
||||
|
||||
|
||||
@default_vllm_config()
|
||||
def benchmark_activation(
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
|
||||
@@ -8,6 +8,7 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0"
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.benchmarks.lib.utils import default_vllm_config
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
W8A8BlockFp8LinearOp,
|
||||
)
|
||||
@@ -40,6 +41,7 @@ DEEPSEEK_V3_SHAPES = [
|
||||
]
|
||||
|
||||
|
||||
@default_vllm_config()
|
||||
def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
|
||||
"""Build runner function for w8a8 block fp8 matmul."""
|
||||
factor_for_scale = 1e-2
|
||||
@@ -11,6 +11,7 @@ import torch
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||
@@ -161,7 +162,7 @@ def bench_run(
|
||||
w2_fp8q_cutlass,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
activation="silu",
|
||||
activation=MoEActivation.SILU,
|
||||
global_num_experts=num_experts,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
@@ -30,6 +30,9 @@ import torch.distributed as dist
|
||||
from torch.distributed import ProcessGroup
|
||||
|
||||
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
|
||||
from vllm.distributed.device_communicators.flashinfer_all_reduce import (
|
||||
FlashInferAllReduce,
|
||||
)
|
||||
from vllm.distributed.device_communicators.pynccl import (
|
||||
PyNcclCommunicator,
|
||||
register_nccl_symmetric_ops,
|
||||
@@ -44,7 +47,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Default sequence lengths to benchmark
|
||||
DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
|
||||
DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192]
|
||||
|
||||
# Fixed hidden size and dtype for all benchmarks
|
||||
HIDDEN_SIZE = 8192
|
||||
@@ -81,6 +84,7 @@ class CommunicatorBenchmark:
|
||||
self.symm_mem_comm = None
|
||||
self.symm_mem_comm_multimem = None
|
||||
self.symm_mem_comm_two_shot = None
|
||||
self.fi_ar_comm = None
|
||||
|
||||
self._init_communicators()
|
||||
|
||||
@@ -161,6 +165,22 @@ class CommunicatorBenchmark:
|
||||
)
|
||||
self.symm_mem_comm_two_shot = None
|
||||
|
||||
try:
|
||||
self.fi_ar_comm = FlashInferAllReduce(
|
||||
group=self.cpu_group,
|
||||
device=self.device,
|
||||
)
|
||||
if not self.fi_ar_comm.disabled:
|
||||
logger.info("Rank %s: FlashInferAllReduce initialized", self.rank)
|
||||
else:
|
||||
logger.info("Rank %s: FlashInferAllReduce disabled", self.rank)
|
||||
self.fi_ar_comm = None
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e
|
||||
)
|
||||
self.fi_ar_comm = None
|
||||
|
||||
def benchmark_allreduce(
|
||||
self, sequence_length: int, num_warmup: int, num_trials: int
|
||||
) -> dict[str, float]:
|
||||
@@ -180,7 +200,8 @@ class CommunicatorBenchmark:
|
||||
lambda t, c=comm: c.custom_all_reduce(t),
|
||||
lambda t, c=comm: c.should_custom_ar(t),
|
||||
comm.capture(),
|
||||
"1stage", # env variable value
|
||||
{"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"},
|
||||
None, # no destroy function
|
||||
)
|
||||
)
|
||||
# CustomAllreduce two-shot
|
||||
@@ -190,7 +211,8 @@ class CommunicatorBenchmark:
|
||||
lambda t, c=comm: c.custom_all_reduce(t),
|
||||
lambda t, c=comm: c.should_custom_ar(t),
|
||||
comm.capture(),
|
||||
"2stage", # env variable value
|
||||
{"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"},
|
||||
None, # no destroy function
|
||||
)
|
||||
)
|
||||
|
||||
@@ -202,7 +224,8 @@ class CommunicatorBenchmark:
|
||||
lambda t, c=comm: c.all_reduce(t),
|
||||
lambda t: True, # Always available if initialized
|
||||
nullcontext(),
|
||||
None, # no env variable needed
|
||||
{}, # no env variable needed
|
||||
None, # no destroy function
|
||||
)
|
||||
)
|
||||
communicators.append(
|
||||
@@ -211,7 +234,8 @@ class CommunicatorBenchmark:
|
||||
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
|
||||
lambda t: True, # Always available if initialized
|
||||
nullcontext(),
|
||||
None, # no env variable needed
|
||||
{}, # no env variable needed
|
||||
None, # no destroy function
|
||||
)
|
||||
)
|
||||
|
||||
@@ -223,7 +247,8 @@ class CommunicatorBenchmark:
|
||||
lambda t, c=comm: c.all_reduce(t),
|
||||
lambda t, c=comm: c.should_use_symm_mem(t),
|
||||
nullcontext(),
|
||||
None, # no env variable needed
|
||||
{}, # no env variable needed
|
||||
None, # no destroy function
|
||||
)
|
||||
)
|
||||
|
||||
@@ -235,29 +260,67 @@ class CommunicatorBenchmark:
|
||||
lambda t, c=comm: c.all_reduce(t),
|
||||
lambda t, c=comm: c.should_use_symm_mem(t),
|
||||
nullcontext(),
|
||||
None, # no env variable needed
|
||||
{}, # no env variable needed
|
||||
None, # no destroy function needed
|
||||
)
|
||||
)
|
||||
|
||||
if self.fi_ar_comm is not None:
|
||||
comm = self.fi_ar_comm
|
||||
communicators.append(
|
||||
(
|
||||
"flashinfer_trtllm",
|
||||
lambda t, c=comm: c.all_reduce(t),
|
||||
lambda t, c=comm: c.should_use_fi_ar(t),
|
||||
nullcontext(),
|
||||
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"},
|
||||
lambda c=comm: c.destroy(),
|
||||
)
|
||||
)
|
||||
communicators.append(
|
||||
(
|
||||
"flashinfer_mnnvl",
|
||||
lambda t, c=comm: c.all_reduce(t),
|
||||
lambda t, c=comm: c.should_use_fi_ar(t),
|
||||
nullcontext(),
|
||||
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"},
|
||||
lambda c=comm: c.destroy(),
|
||||
)
|
||||
)
|
||||
|
||||
# Benchmark each communicator
|
||||
for name, allreduce_fn, should_use_fn, context, env_var in communicators:
|
||||
# Set environment variable if needed
|
||||
if env_var is not None:
|
||||
os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
|
||||
else:
|
||||
# Clear the environment variable to avoid interference
|
||||
os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
|
||||
|
||||
latency = self.benchmark_allreduce_single(
|
||||
sequence_length,
|
||||
allreduce_fn,
|
||||
should_use_fn,
|
||||
context,
|
||||
num_warmup,
|
||||
num_trials,
|
||||
)
|
||||
if latency is not None:
|
||||
results[name] = latency
|
||||
for (
|
||||
name,
|
||||
allreduce_fn,
|
||||
should_use_fn,
|
||||
context,
|
||||
env_dict,
|
||||
destroy_fn,
|
||||
) in communicators:
|
||||
# Save original values and apply new environment variables
|
||||
saved_env = {key: os.environ.get(key) for key in env_dict}
|
||||
for key, value in env_dict.items():
|
||||
os.environ[key] = value
|
||||
try:
|
||||
latency = self.benchmark_allreduce_single(
|
||||
sequence_length,
|
||||
allreduce_fn,
|
||||
should_use_fn,
|
||||
context,
|
||||
num_warmup,
|
||||
num_trials,
|
||||
)
|
||||
if latency is not None:
|
||||
results[name] = latency
|
||||
finally:
|
||||
if destroy_fn is not None:
|
||||
destroy_fn()
|
||||
# Restore environment variables to their original state
|
||||
for key, original_value in saved_env.items():
|
||||
if original_value is None:
|
||||
os.environ.pop(key, None)
|
||||
else:
|
||||
os.environ[key] = original_value
|
||||
|
||||
return results
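The save/apply/restore pattern above could also be factored into a small context manager; a sketch under that assumption (not part of the diff):

import os
from contextlib import contextmanager

@contextmanager
def temporary_env(env: dict[str, str]):
    # Apply the given environment variables for the duration of the block,
    # then restore each variable to its original value (or remove it).
    saved = {key: os.environ.get(key) for key in env}
    os.environ.update(env)
    try:
        yield
    finally:
        for key, original in saved.items():
            if original is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = original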
@@ -5,8 +5,11 @@
|
||||
Benchmark for FlashInfer fused collective operations vs standard operations.
|
||||
|
||||
This benchmark compares:
|
||||
1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant)
|
||||
2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
|
||||
1. FlashInfer's allreduce_fusion with trtllm backend
|
||||
(fused allreduce + rmsnorm + optional FP8/FP4 quant)
|
||||
2. FlashInfer's allreduce_fusion with mnnvl backend
|
||||
(fused allreduce + rmsnorm only, no quantization support)
|
||||
3. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
|
||||
|
||||
Usage with torchrun:
|
||||
torchrun --nproc_per_node=2 benchmark_fused_collective.py
|
||||
@@ -24,7 +27,6 @@ import torch.distributed as dist # type: ignore
|
||||
|
||||
from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed import (
|
||||
get_tp_group,
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
from vllm.distributed.parallel_state import (
|
||||
@@ -49,14 +51,19 @@ SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Try to import FlashInfer
|
||||
TorchDistBackend = None
|
||||
try:
|
||||
import flashinfer.comm as flashinfer_comm # type: ignore
|
||||
from flashinfer.comm.mnnvl import ( # type: ignore
|
||||
TorchDistBackend,
|
||||
)
|
||||
|
||||
if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"):
|
||||
if not (
|
||||
hasattr(flashinfer_comm, "allreduce_fusion")
|
||||
and hasattr(flashinfer_comm, "create_allreduce_fusion_workspace")
|
||||
):
|
||||
flashinfer_comm = None
|
||||
logger.warning(
|
||||
"FlashInfer comm module found but missing trtllm_allreduce_fusion"
|
||||
)
|
||||
logger.warning("FlashInfer comm module found but missing allreduce_fusion API")
|
||||
except ImportError:
|
||||
flashinfer_comm = None
|
||||
logger.warning("FlashInfer not found, only benchmarking standard operations")
|
||||
@@ -74,57 +81,70 @@ _FI_MAX_SIZES = {
|
||||
8: 64 * MiB, # 64MB
|
||||
}
|
||||
|
||||
# Global workspace tensor for FlashInfer
|
||||
_FI_WORKSPACE_TENSOR = None
|
||||
# Global workspace tensors for FlashInfer (keyed by backend name)
|
||||
_FI_WORKSPACES: dict = {}
|
||||
|
||||
# Backends to benchmark
|
||||
FLASHINFER_BACKENDS = ["trtllm", "mnnvl"]
|
||||
|
||||
|
||||
def setup_flashinfer_workspace(
|
||||
backend: str,
|
||||
world_size: int,
|
||||
rank: int,
|
||||
hidden_dim: int,
|
||||
max_token_num: int,
|
||||
use_fp32_lamport: bool = False,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
"""Setup FlashInfer workspace for fused allreduce operations."""
|
||||
global _FI_WORKSPACE_TENSOR
|
||||
global _FI_WORKSPACES
|
||||
|
||||
if flashinfer_comm is None:
|
||||
return None, None
|
||||
return None
|
||||
|
||||
if world_size not in _FI_MAX_SIZES:
|
||||
logger.warning("FlashInfer not supported for world size %s", world_size)
|
||||
return None, None
|
||||
return None
|
||||
|
||||
try:
|
||||
# Create IPC workspace
|
||||
ipc_handles, workspace_tensor = (
|
||||
flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
|
||||
tp_rank=rank,
|
||||
tp_size=world_size,
|
||||
max_token_num=max_token_num,
|
||||
hidden_dim=hidden_dim,
|
||||
group=get_tp_group().device_group,
|
||||
use_fp32_lamport=use_fp32_lamport,
|
||||
)
|
||||
kwargs = {}
|
||||
if TorchDistBackend is not None:
|
||||
kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD)
|
||||
|
||||
workspace = flashinfer_comm.create_allreduce_fusion_workspace(
|
||||
backend=backend,
|
||||
world_size=world_size,
|
||||
rank=rank,
|
||||
max_token_num=max_token_num,
|
||||
hidden_dim=hidden_dim,
|
||||
dtype=dtype,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
_FI_WORKSPACE_TENSOR = workspace_tensor
|
||||
return ipc_handles, workspace_tensor
|
||||
_FI_WORKSPACES[backend] = workspace
|
||||
return workspace
|
||||
except Exception as e:
|
||||
logger.error("Failed to setup FlashInfer workspace: %s", e)
|
||||
return None, None
|
||||
logger.error(
|
||||
"Failed to setup FlashInfer workspace (backend=%s): %s", backend, e
|
||||
)
|
||||
return None
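A hypothetical driver that builds one workspace per backend listed in FLASHINFER_BACKENDS; the sizing arguments are placeholders, and world size/rank would normally come from the torchrun-initialized process group.

import torch
import torch.distributed as dist

# Placeholder sizing values; in the benchmark these come from the CLI arguments.
workspaces = {}
world_size, rank = dist.get_world_size(), dist.get_rank()
for backend in FLASHINFER_BACKENDS:
    ws = setup_flashinfer_workspace(
        backend=backend,
        world_size=world_size,
        rank=rank,
        hidden_dim=8192,
        max_token_num=1024,
        dtype=torch.bfloat16,
    )
    if ws is not None:
        workspaces[backend] = ws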
def cleanup_flashinfer_workspace(ipc_handles):
|
||||
"""Cleanup FlashInfer workspace."""
|
||||
if flashinfer_comm is None or ipc_handles is None:
|
||||
def cleanup_flashinfer_workspaces():
|
||||
"""Cleanup all FlashInfer workspaces."""
|
||||
if flashinfer_comm is None:
|
||||
return
|
||||
|
||||
try:
|
||||
group = get_tp_group().device_group
|
||||
flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
|
||||
except Exception as e:
|
||||
logger.error("Failed to cleanup FlashInfer workspace: %s", e)
|
||||
for backend, workspace in _FI_WORKSPACES.items():
|
||||
try:
|
||||
workspace.destroy()
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to cleanup FlashInfer workspace (backend=%s): %s",
|
||||
backend,
|
||||
e,
|
||||
)
|
||||
_FI_WORKSPACES.clear()
|
||||
|
||||
|
||||
class FlashInferFusedAllReduceParams:
|
||||
@@ -132,25 +152,15 @@ class FlashInferFusedAllReduceParams:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
rank: int,
|
||||
world_size: int,
|
||||
use_fp32_lamport: bool = False,
|
||||
max_token_num: int = 1024,
|
||||
):
|
||||
self.rank = rank
|
||||
self.world_size = world_size
|
||||
self.use_fp32_lamport = use_fp32_lamport
|
||||
self.trigger_completion_at_end = True
|
||||
self.launch_with_pdl = True
|
||||
self.fp32_acc = True
|
||||
self.max_token_num = max_token_num
|
||||
|
||||
def get_trtllm_fused_allreduce_kwargs(self):
|
||||
def get_flashinfer_fused_allreduce_kwargs(self):
|
||||
return {
|
||||
"world_rank": self.rank,
|
||||
"world_size": self.world_size,
|
||||
"launch_with_pdl": self.launch_with_pdl,
|
||||
"trigger_completion_at_end": self.trigger_completion_at_end,
|
||||
"fp32_acc": self.fp32_acc,
|
||||
}
|
||||
|
||||
@@ -161,11 +171,12 @@ def flashinfer_fused_allreduce_rmsnorm(
|
||||
rms_gamma: torch.Tensor,
|
||||
rms_eps: float,
|
||||
allreduce_params: "FlashInferFusedAllReduceParams",
|
||||
workspace: object,
|
||||
use_oneshot: bool,
|
||||
norm_out: torch.Tensor | None = None,
|
||||
):
|
||||
"""FlashInfer fused allreduce + rmsnorm operation."""
|
||||
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
|
||||
if flashinfer_comm is None or workspace is None:
|
||||
raise RuntimeError("FlashInfer not available or workspace not initialized")
|
||||
|
||||
if norm_out is None:
|
||||
@@ -174,24 +185,25 @@ def flashinfer_fused_allreduce_rmsnorm(
|
||||
else:
|
||||
residual_out = input_tensor
|
||||
|
||||
flashinfer_comm.trtllm_allreduce_fusion(
|
||||
allreduce_in=input_tensor,
|
||||
token_num=input_tensor.shape[0],
|
||||
layout_code = None
|
||||
if workspace.backend == "trtllm":
|
||||
layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
|
||||
|
||||
flashinfer_comm.allreduce_fusion(
|
||||
input=input_tensor,
|
||||
workspace=workspace,
|
||||
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
|
||||
residual_in=residual,
|
||||
residual_out=residual_out,
|
||||
norm_out=norm_out,
|
||||
rms_gamma=rms_gamma,
|
||||
rms_eps=rms_eps,
|
||||
hidden_dim=input_tensor.shape[-1],
|
||||
workspace_ptrs=_FI_WORKSPACE_TENSOR,
|
||||
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
|
||||
allreduce_out=None,
|
||||
quant_out=None,
|
||||
scale_out=None,
|
||||
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
|
||||
layout_code=layout_code,
|
||||
scale_factor=None,
|
||||
use_oneshot=use_oneshot,
|
||||
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
||||
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
@@ -202,12 +214,16 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
|
||||
rms_eps: float,
|
||||
scale_factor: torch.Tensor,
|
||||
allreduce_params: FlashInferFusedAllReduceParams,
|
||||
workspace: object,
|
||||
use_oneshot: bool = True,
|
||||
norm_out: torch.Tensor | None = None,
|
||||
quant_out: torch.Tensor | None = None,
|
||||
):
|
||||
"""FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
|
||||
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
|
||||
"""FlashInfer fused allreduce + rmsnorm + FP8 quantization.
|
||||
|
||||
Note: Only supported by the trtllm backend.
|
||||
"""
|
||||
if flashinfer_comm is None or workspace is None:
|
||||
raise RuntimeError("FlashInfer not available or workspace not initialized")
|
||||
|
||||
if norm_out is None:
|
||||
@@ -216,24 +232,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
|
||||
else:
|
||||
residual_out = input_tensor
|
||||
|
||||
flashinfer_comm.trtllm_allreduce_fusion(
|
||||
allreduce_in=input_tensor,
|
||||
token_num=input_tensor.shape[0],
|
||||
flashinfer_comm.allreduce_fusion(
|
||||
input=input_tensor,
|
||||
workspace=workspace,
|
||||
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
|
||||
residual_in=residual,
|
||||
residual_out=residual_out,
|
||||
norm_out=norm_out,
|
||||
rms_gamma=rms_gamma,
|
||||
rms_eps=rms_eps,
|
||||
hidden_dim=input_tensor.shape[-1],
|
||||
workspace_ptrs=_FI_WORKSPACE_TENSOR,
|
||||
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
|
||||
allreduce_out=None,
|
||||
quant_out=quant_out,
|
||||
scale_out=None,
|
||||
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
|
||||
scale_factor=scale_factor,
|
||||
use_oneshot=use_oneshot,
|
||||
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
||||
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
@@ -244,13 +257,17 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
|
||||
rms_eps: float,
|
||||
input_global_scale: torch.Tensor,
|
||||
allreduce_params: FlashInferFusedAllReduceParams,
|
||||
workspace: object,
|
||||
quant_out: torch.Tensor,
|
||||
use_oneshot: bool,
|
||||
output_scale: torch.Tensor,
|
||||
norm_out: torch.Tensor | None = None,
|
||||
):
|
||||
"""FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
|
||||
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
|
||||
"""FlashInfer fused allreduce + rmsnorm + FP4 quantization.
|
||||
|
||||
Note: Only supported by the trtllm backend.
|
||||
"""
|
||||
if flashinfer_comm is None or workspace is None:
|
||||
raise RuntimeError("FlashInfer not available or workspace not initialized")
|
||||
|
||||
if norm_out is None:
|
||||
@@ -259,24 +276,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
|
||||
else:
|
||||
residual_out = input_tensor
|
||||
|
||||
flashinfer_comm.trtllm_allreduce_fusion(
|
||||
allreduce_in=input_tensor,
|
||||
token_num=input_tensor.shape[0],
|
||||
flashinfer_comm.allreduce_fusion(
|
||||
input=input_tensor,
|
||||
workspace=workspace,
|
||||
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
|
||||
residual_in=residual,
|
||||
residual_out=residual_out,
|
||||
norm_out=norm_out,
|
||||
rms_gamma=rms_gamma,
|
||||
rms_eps=rms_eps,
|
||||
hidden_dim=input_tensor.shape[-1],
|
||||
workspace_ptrs=_FI_WORKSPACE_TENSOR,
|
||||
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
|
||||
allreduce_out=None,
|
||||
quant_out=quant_out,
|
||||
scale_out=output_scale,
|
||||
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
|
||||
scale_factor=input_global_scale,
|
||||
use_oneshot=use_oneshot,
|
||||
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
|
||||
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
@@ -409,13 +423,16 @@ def run_benchmarks(
|
||||
dtype: torch.dtype,
|
||||
use_residual: bool,
|
||||
allreduce_params: FlashInferFusedAllReduceParams | None,
|
||||
workspaces: dict,
|
||||
quant_modes: set[str],
|
||||
no_oneshot: bool,
|
||||
):
|
||||
"""Run all benchmarks for given configuration.
|
||||
|
||||
Args:
|
||||
quant_mode: "none", "fp8_only", "fp4_only", or "all"
|
||||
allreduce_params: Shared parameters for FlashInfer fused allreduce.
|
||||
workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace.
|
||||
quant_modes: Set of quantization modes: "none", "fp8", "fp4".
|
||||
"""
|
||||
(
|
||||
input_tensor,
|
||||
@@ -431,18 +448,18 @@ def run_benchmarks(

rms_eps = 1e-6
results = {}
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
use_oneshot_options = [False] if no_oneshot else [True, False]

# Create RMSNorm and QuantFP8 layers once for native benchmarks

if "none" in quant_modes:
# Standard AllReduce + RMSNorm
# Re-create VllmFusedAllreduce per config so CustomOp binds the
# correct forward method (native vs custom kernel).
for custom_op in ["-rms_norm", "+rms_norm"]:
with set_current_vllm_config(
VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op]))
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
suffix = (
"_custom_rms_norm" if "+" in custom_op else "_native_rms_norm"
)
@@ -461,6 +478,7 @@ def run_benchmarks(
|
||||
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
|
||||
):
|
||||
try:
|
||||
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
|
||||
standard_allreduce_rmsnorm_native_compiled = torch.compile(
|
||||
vllm_fused_allreduce.allreduce_rmsnorm,
|
||||
fullgraph=True,
|
||||
@@ -476,10 +494,11 @@ def run_benchmarks(
|
||||
logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
|
||||
results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
|
||||
|
||||
# FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot
|
||||
if flashinfer_comm is not None and allreduce_params is not None:
|
||||
# FlashInfer Fused AllReduce + RMSNorm (all backends)
|
||||
for backend, workspace in workspaces.items():
|
||||
for use_oneshot in use_oneshot_options:
|
||||
suffix = "_oneshot" if use_oneshot else "_twoshot"
|
||||
key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}"
|
||||
try:
|
||||
time_ms = benchmark_operation(
|
||||
flashinfer_fused_allreduce_rmsnorm,
|
||||
@@ -489,14 +508,17 @@ def run_benchmarks(
|
||||
rms_gamma=rms_gamma,
|
||||
rms_eps=rms_eps,
|
||||
allreduce_params=allreduce_params,
|
||||
workspace=workspace,
|
||||
use_oneshot=use_oneshot,
|
||||
)
|
||||
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms
|
||||
results[key] = time_ms
|
||||
except Exception as e:
|
||||
logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e)
|
||||
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float(
|
||||
"inf"
|
||||
logger.error(
|
||||
"FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s",
|
||||
backend,
|
||||
e,
|
||||
)
|
||||
results[key] = float("inf")
|
||||
|
||||
if "fp8" in quant_modes:
|
||||
# Standard AllReduce + RMSNorm + FP8 Quant
|
||||
@@ -505,7 +527,7 @@ def run_benchmarks(
|
||||
"_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
|
||||
)
|
||||
for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]:
|
||||
suffix += (
|
||||
op_suffix = suffix + (
|
||||
"_custom_quant_fp8"
|
||||
if "+" in quant_fp8_custom_op
|
||||
else "_native_quant_fp8"
|
||||
@@ -518,16 +540,17 @@ def run_benchmarks(
|
||||
)
|
||||
):
|
||||
try:
|
||||
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
|
||||
time_ms = benchmark_operation(
|
||||
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
|
||||
input_tensor,
|
||||
residual=residual,
|
||||
scale_factor=scale_fp8,
|
||||
)
|
||||
results[f"standard_allreduce{suffix}"] = time_ms
|
||||
results[f"standard_allreduce{op_suffix}"] = time_ms
|
||||
except Exception as e:
|
||||
logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
|
||||
results[f"standard_allreduce{suffix}"] = float("inf")
|
||||
results[f"standard_allreduce{op_suffix}"] = float("inf")
|
||||
|
||||
# Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
|
||||
with set_current_vllm_config(
|
||||
@@ -538,6 +561,7 @@ def run_benchmarks(
|
||||
)
|
||||
):
|
||||
try:
|
||||
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
|
||||
standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile(
|
||||
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
|
||||
fullgraph=True,
|
||||
@@ -560,10 +584,12 @@ def run_benchmarks(
|
||||
"inf"
|
||||
)
|
||||
|
||||
# FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
|
||||
if flashinfer_comm is not None and allreduce_params is not None:
|
||||
# FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only)
|
||||
if "trtllm" in workspaces:
|
||||
trtllm_ws = workspaces["trtllm"]
|
||||
for use_oneshot in use_oneshot_options:
|
||||
suffix = "_oneshot" if use_oneshot else "_twoshot"
|
||||
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}"
|
||||
try:
|
||||
time_ms = benchmark_operation(
|
||||
flashinfer_fused_allreduce_rmsnorm_fp8_quant,
|
||||
@@ -575,19 +601,16 @@ def run_benchmarks(
|
||||
scale_factor=scale_fp8,
|
||||
quant_out=quant_out_fp8,
|
||||
allreduce_params=allreduce_params,
|
||||
workspace=trtllm_ws,
|
||||
use_oneshot=use_oneshot,
|
||||
)
|
||||
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
|
||||
time_ms
|
||||
)
|
||||
results[key] = time_ms
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
|
||||
"FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s",
|
||||
e,
|
||||
)
|
||||
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
|
||||
float("inf")
|
||||
)
|
||||
results[key] = float("inf")
|
||||
|
||||
if "fp4" in quant_modes and current_platform.has_device_capability(100):
|
||||
# Standard AllReduce + RMSNorm + FP4 Quant
|
||||
@@ -603,6 +626,7 @@ def run_benchmarks(
|
||||
)
|
||||
):
|
||||
try:
|
||||
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
|
||||
time_ms = benchmark_operation(
|
||||
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
|
||||
input_tensor,
|
||||
@@ -621,6 +645,7 @@ def run_benchmarks(
|
||||
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
|
||||
):
|
||||
try:
|
||||
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
|
||||
standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile(
|
||||
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
|
||||
fullgraph=True,
|
||||
@@ -645,10 +670,12 @@ def run_benchmarks(
|
||||
"inf"
|
||||
)
|
||||
|
||||
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
|
||||
if flashinfer_comm is not None and allreduce_params is not None:
|
||||
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only)
|
||||
if "trtllm" in workspaces:
|
||||
trtllm_ws = workspaces["trtllm"]
|
||||
for use_oneshot in use_oneshot_options:
|
||||
suffix = "_oneshot" if use_oneshot else "_twoshot"
|
||||
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}"
|
||||
try:
|
||||
time_ms = benchmark_operation(
|
||||
flashinfer_fused_allreduce_rmsnorm_fp4_quant,
|
||||
@@ -659,49 +686,18 @@ def run_benchmarks(
|
||||
rms_eps=rms_eps,
|
||||
input_global_scale=scale_fp4,
|
||||
allreduce_params=allreduce_params,
|
||||
workspace=trtllm_ws,
|
||||
quant_out=fp4_quant_out,
|
||||
output_scale=fp4_output_scale,
|
||||
use_oneshot=use_oneshot,
|
||||
)
|
||||
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
|
||||
time_ms
|
||||
)
|
||||
results[key] = time_ms
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
|
||||
"FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s",
|
||||
e,
|
||||
)
|
||||
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
|
||||
float("inf")
|
||||
)
|
||||
|
||||
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
|
||||
if flashinfer_comm is not None and allreduce_params is not None:
|
||||
try:
|
||||
time_ms = benchmark_operation(
|
||||
flashinfer_fused_allreduce_rmsnorm_fp4_quant,
|
||||
input_tensor,
|
||||
residual=residual,
|
||||
norm_out=norm_out,
|
||||
rms_gamma=rms_gamma,
|
||||
rms_eps=rms_eps,
|
||||
input_global_scale=scale_fp4,
|
||||
allreduce_params=allreduce_params,
|
||||
quant_out=fp4_quant_out,
|
||||
output_scale=fp4_output_scale,
|
||||
use_oneshot=False,
|
||||
)
|
||||
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
|
||||
time_ms
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
|
||||
e,
|
||||
)
|
||||
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
|
||||
"inf"
|
||||
)
|
||||
results[key] = float("inf")
|
||||
|
||||
return results
|
||||
|
||||
@@ -1039,24 +1035,33 @@ def main():

configs = list(itertools.product(args.num_tokens, dtypes, residual_options))

# Setup FlashInfer workspace if available
ipc_handles = None
# Setup FlashInfer workspaces for all backends
allreduce_params = None

if flashinfer_comm is not None:
# Use the largest hidden dimension for workspace setup
max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes)
workspace_dtype = (
torch.float32
if max_element_size == 4
else (torch.bfloat16 if torch.bfloat16 in dtypes else torch.float16)
)
max_num_token = _FI_MAX_SIZES.get(world_size) // (
args.hidden_dim * world_size * 2
args.hidden_dim * max_element_size
)

ipc_handles, workspace_tensor = setup_flashinfer_workspace(
world_size, rank, args.hidden_dim, max_num_token
)

if workspace_tensor is not None:
allreduce_params = FlashInferFusedAllReduceParams(
rank=rank,
for backend in FLASHINFER_BACKENDS:
setup_flashinfer_workspace(
backend=backend,
world_size=world_size,
rank=rank,
hidden_dim=args.hidden_dim,
max_token_num=max_num_token,
dtype=workspace_dtype,
)

if _FI_WORKSPACES:
allreduce_params = FlashInferFusedAllReduceParams(
max_token_num=max_num_token,
)

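A rough, hedged sketch of the new sizing arithmetic in the hunk above, run with an assumed `_FI_MAX_SIZES` entry; the real per-world-size byte budgets are defined in the benchmark script itself.

import torch

# Assumed stand-in for the benchmark's _FI_MAX_SIZES table (values invented
# purely for illustration; see the script for the real budgets).
ASSUMED_FI_MAX_SIZES = {8: 512 * 1024 * 1024}  # 512 MiB for world_size=8

world_size = 8
hidden_dim = 8192
dtypes = [torch.bfloat16, torch.float16]

# Widest element among the benchmarked dtypes: 2 bytes for bf16/fp16.
max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes)

# New rule from the diff: tokens that fit = budget // (hidden_dim * element size).
max_num_token = ASSUMED_FI_MAX_SIZES[world_size] // (hidden_dim * max_element_size)
print(max_num_token)  # 536870912 // 16384 == 32768 tokens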
@@ -1081,6 +1086,7 @@ def main():
dtype,
use_residual,
allreduce_params,
workspaces=_FI_WORKSPACES,
quant_modes=quant_modes,
no_oneshot=args.no_oneshot,
)
@@ -1119,11 +1125,13 @@ def main():

finally:
# Cleanup
if ipc_handles is not None:
cleanup_flashinfer_workspace(ipc_handles)
cleanup_flashinfer_workspaces()

dist.barrier()


if __name__ == "__main__":
main()
from vllm.config import VllmConfig, set_current_vllm_config

with set_current_vllm_config(VllmConfig()):
main()

@@ -5,12 +5,14 @@ import time
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.benchmarks.lib.utils import default_vllm_config
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
@default_vllm_config()
|
||||
def main(
|
||||
num_tokens: int,
|
||||
hidden_size: int,
|
||||
|
||||
@@ -16,6 +16,7 @@ import torch
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
@@ -99,13 +100,38 @@ def benchmark_config(
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
use_int4_w4a16: bool = False,
|
||||
num_iters: int = 100,
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> float:
|
||||
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
if use_int8_w8a16:
|
||||
if use_int4_w4a16:
|
||||
# Int4 packed weights: 2 int4 values per uint8 byte
|
||||
# K dimension is packed (halved)
|
||||
intermediate_size = shard_intermediate_size // 2 # after silu_and_mul
|
||||
w1 = torch.randint(
|
||||
0,
|
||||
255,
|
||||
(
|
||||
num_experts,
|
||||
shard_intermediate_size,
|
||||
hidden_size // 2, # int4 packing
|
||||
),
|
||||
dtype=torch.uint8,
|
||||
)
|
||||
w2 = torch.randint(
|
||||
0,
|
||||
255,
|
||||
(
|
||||
num_experts,
|
||||
hidden_size,
|
||||
intermediate_size // 2, # int4 packing
|
||||
),
|
||||
dtype=torch.uint8,
|
||||
)
|
||||
elif use_int8_w8a16:
|
||||
w1 = torch.randint(
|
||||
-127,
|
||||
127,
|
||||
@@ -139,7 +165,20 @@ def benchmark_config(
|
||||
w2_scale = None
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
if use_int8_w8a16:
|
||||
if use_int4_w4a16:
|
||||
if block_quant_shape is None:
|
||||
raise ValueError("block_quant_shape is required for int4_w4a16")
|
||||
group_size = block_quant_shape[1]
|
||||
# Scales shape: (E, N, K // group_size) in fp16
|
||||
w1_scale = torch.rand(
|
||||
(num_experts, shard_intermediate_size, hidden_size // group_size),
|
||||
dtype=dtype,
|
||||
)
|
||||
w2_scale = torch.rand(
|
||||
(num_experts, hidden_size, intermediate_size // group_size),
|
||||
dtype=dtype,
|
||||
)
|
||||
elif use_int8_w8a16:
|
||||
w1_scale = torch.randn(
|
||||
(num_experts, 2 * shard_intermediate_size), dtype=torch.float32
|
||||
)
|
||||
@@ -198,6 +237,7 @@ def benchmark_config(
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_quant_shape,
|
||||
weight_dtype="int4" if use_int4_w4a16 else None,
|
||||
)
|
||||
|
||||
deep_gemm_experts = None
|
||||
@@ -211,7 +251,8 @@ def benchmark_config(
|
||||
hidden_dim=hidden_size,
|
||||
intermediate_size_per_partition=shard_intermediate_size,
|
||||
num_local_experts=num_experts,
|
||||
activation="silu",
|
||||
num_logical_experts=num_experts,
|
||||
activation=MoEActivation.SILU,
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=init_dtype,
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
@@ -226,9 +267,10 @@ def benchmark_config(
|
||||
x, input_gating, topk, renormalize=not use_deep_gemm
|
||||
)
|
||||
|
||||
inplace = not disable_inplace()
|
||||
if use_deep_gemm:
|
||||
return deep_gemm_experts(
|
||||
x, w1, w2, topk_weights, topk_ids, inplace=True
|
||||
x, w1, w2, topk_weights, topk_ids, inplace=inplace
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
@@ -236,7 +278,7 @@ def benchmark_config(
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
inplace=inplace,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
@@ -478,6 +520,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
use_int4_w4a16: bool = False,
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
@@ -485,7 +528,10 @@ class BenchmarkWorker:
|
||||
|
||||
set_random_seed(self.seed)
|
||||
dtype_str = _get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
dtype,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int4_w4a16=use_int4_w4a16,
|
||||
)
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
# is the intermediate size after silu_and_mul.
|
||||
@@ -516,6 +562,7 @@ class BenchmarkWorker:
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_int4_w4a16=use_int4_w4a16,
|
||||
num_iters=100,
|
||||
block_quant_shape=block_quant_shape,
|
||||
use_deep_gemm=use_deep_gemm,
|
||||
@@ -532,6 +579,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
use_int4_w4a16: bool,
|
||||
search_space: list[dict[str, int]],
|
||||
block_quant_shape: list[int],
|
||||
use_deep_gemm: bool,
|
||||
@@ -542,7 +590,7 @@ class BenchmarkWorker:
|
||||
best_config = None
|
||||
best_time = float("inf")
|
||||
if current_platform.is_rocm():
|
||||
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
|
||||
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
|
||||
search_space = prune_rocm_search_space(
|
||||
num_tokens,
|
||||
shard_intermediate_size,
|
||||
@@ -571,6 +619,7 @@ class BenchmarkWorker:
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_int4_w4a16,
|
||||
num_iters=20,
|
||||
block_quant_shape=block_quant_shape,
|
||||
use_deep_gemm=use_deep_gemm,
|
||||
@@ -618,6 +667,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
|
||||
else {}
|
||||
),
|
||||
**({"kpack": config["kpack"]} if "kpack" in config else {}),
|
||||
**({"SPLIT_K": config["SPLIT_K"]} if "SPLIT_K" in config else {}),
|
||||
}
|
||||
|
||||
|
||||
@@ -630,11 +680,15 @@ def save_configs(
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
use_int4_w4a16: bool,
|
||||
block_quant_shape: list[int],
|
||||
save_dir: str,
|
||||
) -> None:
|
||||
dtype_str = _get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
dtype,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int4_w4a16=use_int4_w4a16,
|
||||
)
|
||||
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
@@ -686,6 +740,7 @@ def get_model_params(config):
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"GlmMoeDsaForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
"Glm4MoeLiteForCausalLM",
|
||||
"NemotronHForCausalLM",
|
||||
@@ -735,6 +790,38 @@ def get_model_params(config):
|
||||
return E, topk, intermediate_size, hidden_size
|
||||
|
||||
|
||||
def get_quantization_group_size(config) -> int | None:
    """Extract the quantization group size from the HF model config.

    This reads directly from the HuggingFace config object (as returned by
    ``get_config()``), not from vLLM's quantization config classes.

    Supports AWQ/GPTQ-style configs (direct 'group_size' key) and
    compressed-tensors configs (nested inside 'config_groups').
    """
    quantization_config = getattr(config, "quantization_config", {})
    if not isinstance(quantization_config, dict):
        return None
    # AWQ / GPTQ style: group_size is a top-level key
    gs = quantization_config.get("group_size")
    if gs is not None:
        return gs
    # compressed-tensors style: group_size is nested in config_groups
    config_groups = quantization_config.get("config_groups", {})
    if not isinstance(config_groups, dict):
        return None
    for group_cfg in config_groups.values():
        if not isinstance(group_cfg, dict):
            continue
        weights = group_cfg.get("weights", {})
        if not isinstance(weights, dict):
            continue
        gs = weights.get("group_size")
        if gs is not None:
            return gs
    return None

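A quick, hedged usage sketch for the helper above; the config objects are made up (their field layouts mimic AWQ and compressed-tensors checkpoints, not any real model), and the function is assumed to be in scope from this file.

from types import SimpleNamespace

# Hypothetical HF-style configs; real ones come from get_config(model).
awq_cfg = SimpleNamespace(quantization_config={"quant_method": "awq", "group_size": 128})
ct_cfg = SimpleNamespace(
    quantization_config={
        "config_groups": {"group_0": {"weights": {"num_bits": 4, "group_size": 64}}}
    }
)
plain_cfg = SimpleNamespace()  # no quantization_config attribute at all

print(get_quantization_group_size(awq_cfg))    # 128 (top-level 'group_size')
print(get_quantization_group_size(ct_cfg))     # 64 (nested under config_groups)
print(get_quantization_group_size(plain_cfg))  # None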
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
@@ -753,7 +840,20 @@ def main(args: argparse.Namespace):
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_int4_w4a16 = args.dtype == "int4_w4a16"
|
||||
block_quant_shape = get_weight_block_size_safety(config)
|
||||
if use_int4_w4a16:
|
||||
group_size = get_quantization_group_size(config)
|
||||
if group_size is None:
|
||||
raise ValueError(
|
||||
"Could not determine group_size from model config. "
|
||||
"The model's quantization_config must contain a 'group_size' "
|
||||
"field (AWQ/GPTQ) or 'config_groups.*.weights.group_size' "
|
||||
"(compressed-tensors)."
|
||||
)
|
||||
# For int4_w4a16, block_shape = [0, group_size]
|
||||
# block_shape[0]=0 means no block quantization on N dimension
|
||||
block_quant_shape = [0, group_size]
|
||||
|
||||
if args.batch_size is None:
|
||||
batch_sizes = [
|
||||
@@ -807,8 +907,20 @@ def main(args: argparse.Namespace):
|
||||
return ray.get(outputs)
|
||||
|
||||
if args.tune:
|
||||
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
|
||||
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
|
||||
# int4_w4a16 weights are uint8-packed, not fp16; treat like fp8 for
|
||||
# search space generation (no matrix_instr_nonkdim/kpack exploration).
|
||||
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
|
||||
# For int4_w4a16, the group_size constraint on BLOCK_SIZE_K does not
|
||||
# apply: the gptq_awq kernel handles arbitrary BLOCK_SIZE_K regardless
|
||||
# of group_size. Skip block_quant_shape filtering to keep the full
|
||||
# search space (e.g. BLOCK_SIZE_K=64 with group_size=128).
|
||||
tune_block_quant_shape = None if use_int4_w4a16 else block_quant_shape
|
||||
search_space = get_configs_compute_bound(is_fp16, tune_block_quant_shape)
|
||||
if use_int4_w4a16:
|
||||
# SPLIT_K is a required kernel constexpr for gptq_awq kernel;
|
||||
# only SPLIT_K=1 is used at runtime, so fix it during tuning.
|
||||
for cfg in search_space:
|
||||
cfg["SPLIT_K"] = 1
|
||||
print(f"Start tuning over {len(search_space)} configurations...")
|
||||
if use_deep_gemm:
|
||||
raise ValueError(
|
||||
@@ -828,6 +940,7 @@ def main(args: argparse.Namespace):
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_int4_w4a16,
|
||||
search_space,
|
||||
block_quant_shape,
|
||||
use_deep_gemm,
|
||||
@@ -847,6 +960,7 @@ def main(args: argparse.Namespace):
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_int4_w4a16,
|
||||
block_quant_shape,
|
||||
args.save_dir,
|
||||
)
|
||||
@@ -865,6 +979,7 @@ def main(args: argparse.Namespace):
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_int4_w4a16,
|
||||
block_quant_shape,
|
||||
use_deep_gemm,
|
||||
)
|
||||
@@ -887,7 +1002,10 @@ if __name__ == "__main__":
|
||||
)
|
||||
parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
|
||||
"--dtype",
|
||||
type=str,
|
||||
choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
|
||||
default="auto",
|
||||
)
|
||||
parser.add_argument("--use-deep-gemm", action="store_true")
|
||||
parser.add_argument(
|
||||
|
||||
278
benchmarks/kernels/benchmark_moe_defaults.py
Normal file
278
benchmarks/kernels/benchmark_moe_defaults.py
Normal file
@@ -0,0 +1,278 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing old vs new default fused MoE configs.

Runs the triton fused_moe kernel with three configurations for each scenario:
1. Tuned config (from JSON file, if available) — the target to match
2. Old default (the hardcoded defaults before this change)
3. New default (the improved defaults)

Usage:
    python benchmarks/kernels/benchmark_moe_defaults.py

Produces a table showing kernel time (us) and speedup of new vs old defaults.
"""

import torch

from vllm.model_executor.layers.fused_moe import fused_topk, override_config
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    get_default_config,
    get_moe_configs,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.torch_utils import set_random_seed

FP8_DTYPE = current_platform.fp8_dtype()


def old_default_config(M, E, N, K, topk, dtype=None, block_shape=None):
    """The original defaults before https://github.com/vllm-project/vllm/pull/34846,
    for comparison."""
    if dtype == "fp8_w8a8" and block_shape is not None:
        return {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": block_shape[0],
            "BLOCK_SIZE_K": block_shape[1],
            "GROUP_SIZE_M": 32,
            "SPLIT_K": 1,
            "num_warps": 4,
            "num_stages": 3 if not current_platform.is_rocm() else 2,
        }
    elif M <= E:
        return {
            "BLOCK_SIZE_M": 16,
            "BLOCK_SIZE_N": 32,
            "BLOCK_SIZE_K": 64,
            "GROUP_SIZE_M": 1,
            "SPLIT_K": 1,
        }
    else:
        return {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": 64,
            "BLOCK_SIZE_K": 32,
            "GROUP_SIZE_M": 8,
            "SPLIT_K": 1,
        }

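To make the branch selection in old_default_config concrete, a small hedged sketch; the shapes are illustrative only (not taken from the MODELS table below) and the function is assumed to be in scope from this file.

# M <= E hits the small-tile decode branch; large M hits the generic branch;
# fp8 with a block_shape copies the block sizes into BLOCK_SIZE_N/K.
decode_cfg = old_default_config(M=1, E=8, N=7168, K=4096, topk=2)
prefill_cfg = old_default_config(M=1024, E=8, N=7168, K=4096, topk=2)
fp8_blk_cfg = old_default_config(
    M=64, E=256, N=512, K=7168, topk=8, dtype="fp8_w8a8", block_shape=[128, 128]
)
print(decode_cfg["BLOCK_SIZE_M"])   # 16
print(prefill_cfg["BLOCK_SIZE_M"])  # 64
print(fp8_blk_cfg["BLOCK_SIZE_N"])  # 128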
def benchmark_config(
|
||||
config,
|
||||
M,
|
||||
E,
|
||||
N,
|
||||
K,
|
||||
topk,
|
||||
dtype,
|
||||
use_fp8=False,
|
||||
block_shape=None,
|
||||
num_iters=100,
|
||||
):
|
||||
"""Time a single kernel config. Returns kernel time in microseconds."""
|
||||
init_dtype = torch.float16 if use_fp8 else dtype
|
||||
|
||||
a = torch.randn(M, K, device="cuda", dtype=init_dtype) / 10
|
||||
w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=init_dtype) / 10
|
||||
w2 = torch.randn(E, K, N, device="cuda", dtype=init_dtype) / 10
|
||||
|
||||
w1_scale = None
|
||||
w2_scale = None
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
if use_fp8:
|
||||
if block_shape is not None:
|
||||
bsn, bsk = block_shape
|
||||
n_tiles_w1 = triton.cdiv(2 * N, bsn)
|
||||
k_tiles_w1 = triton.cdiv(K, bsk)
|
||||
n_tiles_w2 = triton.cdiv(K, bsn)
|
||||
k_tiles_w2 = triton.cdiv(N, bsk)
|
||||
w1_scale = torch.rand(
|
||||
E, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32
|
||||
)
|
||||
w2_scale = torch.rand(
|
||||
E, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32
|
||||
)
|
||||
else:
|
||||
w1_scale = torch.rand(E, device="cuda", dtype=torch.float32)
|
||||
w2_scale = torch.rand(E, device="cuda", dtype=torch.float32)
|
||||
a1_scale = torch.rand(1, device="cuda", dtype=torch.float32)
|
||||
a2_scale = torch.rand(1, device="cuda", dtype=torch.float32)
|
||||
# Only weights are stored in fp8; activations stay in bf16/fp16
|
||||
# and get dynamically quantized inside the kernel.
|
||||
w1 = w1.to(FP8_DTYPE)
|
||||
w2 = w2.to(FP8_DTYPE)
|
||||
|
||||
quant_config = FusedMoEQuantConfig.make(
|
||||
quant_dtype=torch.float8_e4m3fn if use_fp8 else None,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
gating = torch.randn(M, E, device="cuda", dtype=torch.float32)
|
||||
|
||||
# Warmup
|
||||
for _ in range(20):
|
||||
with override_config(config):
|
||||
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
|
||||
fused_experts(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
start.record()
|
||||
for _ in range(num_iters):
|
||||
with override_config(config):
|
||||
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
|
||||
fused_experts(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
||||
|
||||
|
||||
# Model configurations: (name, E, N, K, topk, dtype_str, use_fp8, block_shape)
|
||||
# N = moe_intermediate_size // tp_size (the value used in config file lookup)
|
||||
MODELS = [
|
||||
# --- Few experts ---
|
||||
("Mixtral bf16", 8, 7168, 4096, 2, None, False, None),
|
||||
("Mixtral fp8", 8, 7168, 4096, 2, "fp8_w8a8", True, None),
|
||||
# --- Many experts: real model shapes at tp=1 ---
|
||||
# Qwen2-MoE-57B: E=60, topk=4, N=1408, K=2048
|
||||
("Qwen2-MoE bf16", 60, 1408, 2048, 4, None, False, None),
|
||||
# DeepSeek-V2: E=64, topk=6, N=1407, K=4096
|
||||
# (use 1408 to avoid odd alignment; real model is 1407)
|
||||
("DeepSeek-V2 bf16", 64, 1408, 4096, 6, None, False, None),
|
||||
# OLMoE-7B: E=64, topk=8, N=2048, K=2048
|
||||
("OLMoE bf16", 64, 2048, 2048, 8, None, False, None),
|
||||
# GLM-4-100B-A10B: E=128, topk=8, N=1408, K=4096
|
||||
("GLM-4-MoE bf16", 128, 1408, 4096, 8, None, False, None),
|
||||
# Qwen3-30B-A3B: E=128, topk=8, N=768, K=2048
|
||||
("Qwen3-MoE bf16", 128, 768, 2048, 8, None, False, None),
|
||||
# DeepSeek-V3 / MiMo-V2-Flash: E=256, topk=8, N=2048, K=7168
|
||||
("DeepSeek-V3 bf16", 256, 2048, 7168, 8, None, False, None),
|
||||
# Qwen3.5-70B-A22B (Qwen3-Next): E=512, topk=10, N=512, K=2048
|
||||
("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
|
||||
# E=128 N=1856 bf16
|
||||
("E128 N1856 bf16", 128, 1856, 4096, 8, None, False, None),
|
||||
# E=256 N=512 bf16 (DS-V3 tp=4)
|
||||
("DS-V3 tp4 bf16", 256, 512, 7168, 8, None, False, None),
|
||||
# E=512 N=512 bf16 (Qwen3-Next tp=1)
|
||||
("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
|
||||
# E=512 N=256 bf16 (Qwen3-Next tp=2)
|
||||
("Qwen3-Next tp2", 512, 256, 2048, 10, None, False, None),
|
||||
# --- FP8 block quant (many experts) ---
|
||||
# DS-V3 tp=4: E=256, N=512, fp8 block
|
||||
("DS-V3 tp4 fp8blk", 256, 512, 7168, 8, "fp8_w8a8", True, [128, 128]),
|
||||
# DS-V3 tp=8: E=256, N=256, fp8 block
|
||||
("DS-V3 tp8 fp8blk", 256, 256, 7168, 8, "fp8_w8a8", True, [128, 128]),
|
||||
# Qwen3-Next tp=2 fp8 block
|
||||
("Qwen3-Next tp2 fp8blk", 512, 256, 2048, 10, "fp8_w8a8", True, [128, 128]),
|
||||
]
|
||||
|
||||
BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||
|
||||
|
||||
def main():
|
||||
set_random_seed(0)
|
||||
torch.set_default_device("cuda")
|
||||
dtype = torch.bfloat16
|
||||
|
||||
for name, E, N, K, topk, dtype_str, use_fp8, block_shape in MODELS:
|
||||
print(f"\n{'=' * 90}")
|
||||
print(f" {name} (E={E}, N={N}, K={K}, topk={topk})")
|
||||
print(f"{'=' * 90}")
|
||||
|
||||
# Try to load tuned config
|
||||
block_n = block_shape[0] if block_shape else None
|
||||
block_k = block_shape[1] if block_shape else None
|
||||
tuned = get_moe_configs(E, N, dtype_str, block_n, block_k)
|
||||
has_tuned = tuned is not None
|
||||
print(f" Tuned config available: {has_tuned}")
|
||||
|
||||
hdr = (
|
||||
f"{'Batch':>6} | {'Tuned (us)':>11} | {'Old (us)':>11} | "
|
||||
f"{'New (us)':>11} | {'New/Old':>8} | {'New/Tuned':>10}"
|
||||
)
|
||||
print(f" {hdr}")
|
||||
print(f" {'-' * len(hdr)}")
|
||||
|
||||
for M in BATCH_SIZES:
|
||||
old_cfg = old_default_config(M, E, N, K, topk, dtype_str, block_shape)
|
||||
new_cfg = get_default_config(M, E, N, K, topk, dtype_str, block_shape)
|
||||
|
||||
if has_tuned:
|
||||
tuned_cfg = tuned[min(tuned.keys(), key=lambda x: abs(x - M))]
|
||||
t_tuned = benchmark_config(
|
||||
tuned_cfg,
|
||||
M,
|
||||
E,
|
||||
N,
|
||||
K,
|
||||
topk,
|
||||
dtype,
|
||||
use_fp8=use_fp8,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
else:
|
||||
t_tuned = None
|
||||
|
||||
t_old = benchmark_config(
|
||||
old_cfg,
|
||||
M,
|
||||
E,
|
||||
N,
|
||||
K,
|
||||
topk,
|
||||
dtype,
|
||||
use_fp8=use_fp8,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
t_new = benchmark_config(
|
||||
new_cfg,
|
||||
M,
|
||||
E,
|
||||
N,
|
||||
K,
|
||||
topk,
|
||||
dtype,
|
||||
use_fp8=use_fp8,
|
||||
block_shape=block_shape,
|
||||
)
|
||||
|
||||
ratio_new_old = t_new / t_old
|
||||
tuned_str = f"{t_tuned:11.2f}" if t_tuned else f"{'N/A':>11}"
|
||||
ratio_tuned = f"{t_new / t_tuned:10.2f}x" if t_tuned else f"{'N/A':>10}"
|
||||
# flag regressions where new default is >5% slower than old
|
||||
marker = " <--" if ratio_new_old > 1.05 else ""
|
||||
|
||||
print(
|
||||
f" {M:>6} | {tuned_str} | {t_old:11.2f} | {t_new:11.2f} "
|
||||
f"| {ratio_new_old:7.2f}x | {ratio_tuned}{marker}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
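For reference, a hedged sketch of the nearest-batch-size lookup used in main() above when a tuned JSON config exists; the keys and values here are invented, not read from a real config file.

tuned = {1: {"BLOCK_SIZE_M": 16}, 64: {"BLOCK_SIZE_M": 64}, 1024: {"BLOCK_SIZE_M": 128}}
M = 200
nearest_key = min(tuned.keys(), key=lambda x: abs(x - M))
print(nearest_key, tuned[nearest_key])  # 64 {'BLOCK_SIZE_M': 64}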
@@ -36,6 +36,7 @@ from typing import Any
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.benchmarks.lib.utils import default_vllm_config
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
@@ -78,6 +79,7 @@ def calculate_stats(times: list[float]) -> dict[str, float]:
|
||||
}
|
||||
|
||||
|
||||
@default_vllm_config()
|
||||
def benchmark_mrope(
|
||||
model_name: str,
|
||||
num_tokens: int,
|
||||
|
||||
@@ -7,6 +7,7 @@ from unittest.mock import patch
|
||||
import pandas as pd
|
||||
import torch
|
||||
|
||||
from vllm.benchmarks.lib.utils import default_vllm_config
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.triton_utils import triton
|
||||
@@ -84,6 +85,7 @@ def calculate_diff(
|
||||
configs = []
|
||||
|
||||
|
||||
@default_vllm_config()
|
||||
def benchmark_quantization(
|
||||
batch_size,
|
||||
hidden_size,
|
||||
@@ -5,6 +5,7 @@ import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.benchmarks.lib.utils import default_vllm_config
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
@@ -29,6 +30,7 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
|
||||
args={},
|
||||
)
|
||||
)
|
||||
@default_vllm_config()
|
||||
def benchmark(batch_size, seq_len, num_heads, provider):
|
||||
dtype = torch.bfloat16
|
||||
max_position = 8192
|
||||
|
||||
@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do
usage
;;
*)
echo "Unknown argument: $1\n"
printf "Unknown argument: %s\n" "$1"
usage
;;
esac
@@ -84,15 +84,17 @@ mkdir -p "$OUTPUT_DIR"
QPS_VALUES=(25 20 15 10 5 1)

# Common parameters
COMMON_PARAMS="--backend $BACKEND \
--model $MODEL \
--dataset $DATASET \
--structured-output-ratio $STRUCTURED_OUTPUT_RATIO \
--save-results \
--result-dir $OUTPUT_DIR \
--output-len $MAX_NEW_TOKENS \
--port $PORT \
--tokenizer-mode $TOKENIZER_MODE"
COMMON_PARAMS=(
--backend "$BACKEND"
--model "$MODEL"
--dataset "$DATASET"
--structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--save-results
--result-dir "$OUTPUT_DIR"
--output-len "$MAX_NEW_TOKENS"
--port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)

echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND"
@@ -109,17 +111,17 @@ for qps in "${QPS_VALUES[@]}"; do
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")

# Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"

NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
echo "Running benchmark with $NUM_PROMPTS prompts"

# Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate "$qps" \
--result-filename "$FILENAME" \
--num-prompts $NUM_PROMPTS
--num-prompts "$NUM_PROMPTS"

echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------"

@@ -18,6 +18,7 @@ set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16})

include_directories("${CMAKE_SOURCE_DIR}/csrc")

@@ -115,6 +116,10 @@ else()
set(AVX512_FOUND ON)
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
endif()
if (ENABLE_ARM_BF16)
set(ARM_BF16_FOUND ON)
message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable")
endif()
endif()

if (AVX512_FOUND AND NOT AVX512_DISABLED)

@@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""

@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
GIT_TAG 5824e6e2008271063c3229ab3e7032bd74abbbc6
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

@@ -9,6 +9,113 @@
|
||||
|
||||
namespace vllm {
|
||||
|
||||
struct alignas(32) u32x8_t {
|
||||
uint32_t u0, u1, u2, u3, u4, u5, u6, u7;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
|
||||
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
|
||||
asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
|
||||
: "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
|
||||
"=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
|
||||
: "l"(ptr));
|
||||
#else
|
||||
const uint4* uint_ptr = reinterpret_cast<const uint4*>(ptr);
|
||||
uint4 top_half = __ldg(&uint_ptr[0]);
|
||||
uint4 bottom_half = __ldg(&uint_ptr[1]);
|
||||
val.u0 = top_half.x;
|
||||
val.u1 = top_half.y;
|
||||
val.u2 = top_half.z;
|
||||
val.u3 = top_half.w;
|
||||
val.u4 = bottom_half.x;
|
||||
val.u5 = bottom_half.y;
|
||||
val.u6 = bottom_half.z;
|
||||
val.u7 = bottom_half.w;
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
|
||||
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
|
||||
asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
|
||||
:
|
||||
: "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),
|
||||
"r"(val.u4), "r"(val.u5), "r"(val.u6), "r"(val.u7)
|
||||
: "memory");
|
||||
#else
|
||||
uint4* uint_ptr = reinterpret_cast<uint4*>(ptr);
|
||||
uint_ptr[0] = make_uint4(val.u0, val.u1, val.u2, val.u3);
|
||||
uint_ptr[1] = make_uint4(val.u4, val.u5, val.u6, val.u7);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <bool support_256>
|
||||
struct VecTraits;
|
||||
|
||||
template <>
|
||||
struct VecTraits<true> {
|
||||
static constexpr int ARCH_MAX_VEC_SIZE = 32;
|
||||
using vec_t = u32x8_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VecTraits<false> {
|
||||
static constexpr int ARCH_MAX_VEC_SIZE = 16;
|
||||
using vec_t = int4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct PackedTraits;
|
||||
|
||||
template <>
|
||||
struct PackedTraits<c10::BFloat16> {
|
||||
using packed_t = __nv_bfloat162;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedTraits<c10::Half> {
|
||||
using packed_t = __half2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedTraits<float> {
|
||||
using packed_t = float2;
|
||||
};
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ float2 cast_to_float2(const packed_t& val) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
|
||||
return __bfloat1622float2(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, __half2>) {
|
||||
return __half22float2(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return float2(val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t cast_to_packed(const float2& val) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
|
||||
return __float22bfloat162_rn(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, __half2>) {
|
||||
return __float22half2_rn(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return float2(val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t packed_mul(const packed_t& x,
|
||||
const packed_t& y) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162> ||
|
||||
std::is_same_v<packed_t, __half2>) {
|
||||
return __hmul2(x, y);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return make_float2(x.x * y.x, x.y * y.y);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
|
||||
bool act_first>
|
||||
__device__ __forceinline__ scalar_t compute(const scalar_t& x,
|
||||
@@ -16,52 +123,69 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
|
||||
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
|
||||
}
|
||||
|
||||
template <typename packed_t, packed_t (*PACKED_ACT_FN)(const packed_t&),
|
||||
bool act_first>
|
||||
__device__ __forceinline__ packed_t packed_compute(const packed_t& x,
|
||||
const packed_t& y) {
|
||||
return act_first ? packed_mul(PACKED_ACT_FN(x), y)
|
||||
: packed_mul(x, PACKED_ACT_FN(y));
|
||||
}
|
||||
|
||||
// Check if all pointers are 16-byte aligned for int4 vectorized access
|
||||
__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
|
||||
__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
|
||||
return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
|
||||
}
|
||||
|
||||
// Check if all pointers are 16-byte aligned for longlong4_32a vectorized access
|
||||
__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) {
|
||||
return (reinterpret_cast<uintptr_t>(ptr) & 31) == 0;
|
||||
}
|
||||
|
||||
// Activation and gating kernel template.
|
||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
|
||||
bool act_first>
|
||||
template <typename scalar_t, typename packed_t,
|
||||
scalar_t (*ACT_FN)(const scalar_t&),
|
||||
packed_t (*PACKED_ACT_FN)(const packed_t&), bool act_first,
|
||||
bool use_vec, bool use_256b = false>
|
||||
__global__ void act_and_mul_kernel(
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||
const int d) {
|
||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const scalar_t* x_ptr = input + token_idx * 2 * d;
|
||||
const scalar_t* x_ptr = input + blockIdx.x * 2 * d;
|
||||
const scalar_t* y_ptr = x_ptr + d;
|
||||
scalar_t* out_ptr = out + token_idx * d;
|
||||
scalar_t* out_ptr = out + blockIdx.x * d;
|
||||
|
||||
// Check alignment for 128-bit vectorized access.
|
||||
// All three pointers must be 16-byte aligned for safe int4 operations.
|
||||
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
|
||||
is_16byte_aligned(out_ptr);
|
||||
if constexpr (use_vec) {
|
||||
// Fast path: 128-bit/256-bit vectorized loop
|
||||
using vec_t = typename VecTraits<use_256b>::vec_t;
|
||||
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
|
||||
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
|
||||
|
||||
if (aligned && d >= VEC_SIZE) {
|
||||
// Fast path: 128-bit vectorized loop
|
||||
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
|
||||
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
|
||||
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
|
||||
const int num_vecs = d / VEC_SIZE;
|
||||
const int vec_end = num_vecs * VEC_SIZE;
|
||||
const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
|
||||
const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
|
||||
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
|
||||
const int num_vecs = d / 2 / VEC_SIZE;
|
||||
|
||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
||||
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
|
||||
auto* xp = reinterpret_cast<scalar_t*>(&x);
|
||||
auto* yp = reinterpret_cast<scalar_t*>(&y);
|
||||
auto* rp = reinterpret_cast<scalar_t*>(&r);
|
||||
vec_t x, y;
|
||||
if constexpr (use_256b) {
|
||||
ld256(x, &x_vec[i]);
|
||||
ld256(y, &y_vec[i]);
|
||||
} else {
|
||||
x = VLLM_LDG(&x_vec[i]);
|
||||
y = VLLM_LDG(&y_vec[i]);
|
||||
}
|
||||
auto* xp = reinterpret_cast<packed_t*>(&x);
|
||||
auto* yp = reinterpret_cast<packed_t*>(&y);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; j++) {
|
||||
rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
|
||||
xp[j] =
|
||||
packed_compute<packed_t, PACKED_ACT_FN, act_first>(xp[j], yp[j]);
|
||||
}
|
||||
if constexpr (use_256b) {
|
||||
st256(x, &out_vec[i]);
|
||||
} else {
|
||||
out_vec[i] = x;
|
||||
}
|
||||
out_vec[i] = r;
|
||||
}
|
||||
// Scalar cleanup for remaining elements
|
||||
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
|
||||
out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
|
||||
VLLM_LDG(&y_ptr[i]));
|
||||
}
|
||||
} else {
|
||||
// Scalar fallback for unaligned data or small d
|
||||
@@ -79,6 +203,15 @@ __device__ __forceinline__ T silu_kernel(const T& x) {
|
||||
return (T)(((float)x) / (1.0f + expf((float)-x)));
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t packed_silu_kernel(const packed_t& val) {
|
||||
// x * sigmoid(x)
|
||||
float2 fval = cast_to_float2(val);
|
||||
fval.x = fval.x / (1.0f + expf(-fval.x));
|
||||
fval.y = fval.y / (1.0f + expf(-fval.y));
|
||||
return cast_to_packed<packed_t>(fval);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T gelu_kernel(const T& x) {
|
||||
// Equivalent to PyTorch GELU with 'none' approximation.
|
||||
@@ -89,6 +222,18 @@ __device__ __forceinline__ T gelu_kernel(const T& x) {
|
||||
return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t packed_gelu_kernel(const packed_t& val) {
|
||||
// Equivalent to PyTorch GELU with 'none' approximation.
|
||||
// Refer to:
|
||||
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
|
||||
constexpr float ALPHA = M_SQRT1_2;
|
||||
float2 fval = cast_to_float2(val);
|
||||
fval.x = fval.x * 0.5f * (1.0f + ::erf(fval.x * ALPHA));
|
||||
fval.y = fval.y * 0.5f * (1.0f + ::erf(fval.y * ALPHA));
|
||||
return cast_to_packed<packed_t>(fval);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
|
||||
// Equivalent to PyTorch GELU with 'tanh' approximation.
|
||||
@@ -102,32 +247,83 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
|
||||
return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t
|
||||
packed_gelu_tanh_kernel(const packed_t& val) {
|
||||
// Equivalent to PyTorch GELU with 'tanh' approximation.
|
||||
// Refer to:
|
||||
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
|
||||
float2 fval = cast_to_float2(val);
|
||||
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
|
||||
constexpr float KAPPA = 0.044715;
|
||||
|
||||
float x_cube = fval.x * fval.x * fval.x;
|
||||
float inner = BETA * (fval.x + KAPPA * x_cube);
|
||||
fval.x = 0.5f * fval.x * (1.0f + ::tanhf(inner));
|
||||
|
||||
x_cube = fval.y * fval.y * fval.y;
|
||||
inner = BETA * (fval.y + KAPPA * x_cube);
|
||||
fval.y = 0.5f * fval.y * (1.0f + ::tanhf(inner));
|
||||
return cast_to_packed<packed_t>(fval);
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// Launch activation and gating kernel.
|
||||
// Use ACT_FIRST (bool) indicating whether to apply the activation function
|
||||
// first.
|
||||
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \
|
||||
int d = input.size(-1) / 2; \
|
||||
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||
dim3 grid(num_tokens); \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
if (num_tokens == 0) { \
|
||||
return; \
|
||||
} \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||
input.scalar_type(), "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST> \
|
||||
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
|
||||
input.data_ptr<scalar_t>(), d); \
|
||||
});
|
||||
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \
|
||||
auto dtype = input.scalar_type(); \
|
||||
int d = input.size(-1) / 2; \
|
||||
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||
if (num_tokens == 0) { \
|
||||
return; \
|
||||
} \
|
||||
dim3 grid(num_tokens); \
|
||||
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
|
||||
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
|
||||
int vec_size = support_vec / at::elementSize(dtype); \
|
||||
const bool use_vec = (d % vec_size == 0); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
if (use_vec) { \
|
||||
dim3 block(std::min(d / vec_size, 1024)); \
|
||||
if (cc_major >= 10 && num_tokens > 128) { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
ACT_FIRST, true, true><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
} else { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
ACT_FIRST, true, false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
} \
|
||||
} else { \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
ACT_FIRST, false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
}
|
||||
|
||||
void silu_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
|
||||
true);
|
||||
}
|
||||
|
||||
void mul_and_silu(torch::Tensor& out, // [..., d]
|
||||
@@ -135,19 +331,22 @@ void mul_and_silu(torch::Tensor& out, // [..., d]
|
||||
{
|
||||
// The difference between mul_and_silu and silu_and_mul is that mul_and_silu
|
||||
// applies the silu to the latter half of the input.
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
|
||||
false);
|
||||
}
|
||||
|
||||
void gelu_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, vllm::packed_gelu_kernel,
|
||||
true);
|
||||
}
|
||||
|
||||
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel,
|
||||
vllm::packed_gelu_tanh_kernel, true);
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
@@ -158,42 +357,57 @@ __device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
|
||||
return (T)(f > threshold ? f : 0.0f);
|
||||
}
|
||||
|
||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t
|
||||
packed_fatrelu_kernel(const packed_t& val, const float threshold) {
|
||||
float2 fval = cast_to_float2(val);
|
||||
fval.x = fval.x > threshold ? fval.x : 0.0f;
|
||||
fval.y = fval.y > threshold ? fval.y : 0.0f;
|
||||
return cast_to_packed<packed_t>(fval);
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename packed_t,
|
||||
scalar_t (*ACT_FN)(const scalar_t&, const float),
|
||||
packed_t (*PACKED_ACT_FN)(const packed_t&, const float), bool use_vec,
|
||||
bool use_256b = false>
|
||||
__global__ void act_and_mul_kernel_with_param(
|
||||
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
|
||||
const float param) {
|
||||
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
|
  const int64_t token_idx = blockIdx.x;
  const scalar_t* x_ptr = input + token_idx * 2 * d;
  const scalar_t* x_ptr = input + blockIdx.x * 2 * d;
  const scalar_t* y_ptr = x_ptr + d;
  scalar_t* out_ptr = out + token_idx * d;
  scalar_t* out_ptr = out + blockIdx.x * d;

  // Check alignment for 128-bit vectorized access
  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
                       is_16byte_aligned(out_ptr);
  if constexpr (use_vec) {
    // Fast path: 128-bit/256-bit vectorized loop
    using vec_t = typename VecTraits<use_256b>::vec_t;
    constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
    constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);

  if (aligned && d >= VEC_SIZE) {
    // Fast path: 128-bit vectorized loop
    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
    const int num_vecs = d / VEC_SIZE;
    const int vec_end = num_vecs * VEC_SIZE;
    const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
    const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
    vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
    const int num_vecs = d / 2 / VEC_SIZE;

    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
      auto* xp = reinterpret_cast<scalar_t*>(&x);
      auto* yp = reinterpret_cast<scalar_t*>(&y);
      auto* rp = reinterpret_cast<scalar_t*>(&r);
      vec_t x, y;
      if constexpr (use_256b) {
        ld256(x, &x_vec[i]);
        ld256(y, &y_vec[i]);
      } else {
        x = VLLM_LDG(&x_vec[i]);
        y = VLLM_LDG(&y_vec[i]);
      }
      auto* xp = reinterpret_cast<packed_t*>(&x);
      auto* yp = reinterpret_cast<packed_t*>(&y);
#pragma unroll
      for (int j = 0; j < VEC_SIZE; j++) {
        rp[j] = ACT_FN(xp[j], param) * yp[j];
        xp[j] = packed_mul(PACKED_ACT_FN(xp[j], param), yp[j]);
      }
      if constexpr (use_256b) {
        st256(x, &out_vec[i]);
      } else {
        out_vec[i] = x;
      }
      out_vec[i] = r;
    }
    // Scalar cleanup for remaining elements
    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
      out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
    }
  } else {
    // Scalar fallback for unaligned data or small d
@@ -276,20 +490,58 @@ __global__ void swigluoai_and_mul_kernel(

}  // namespace vllm

#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
  int d = input.size(-1) / 2; \
  int64_t num_tokens = input.numel() / input.size(-1); \
  dim3 grid(num_tokens); \
  dim3 block(std::min(d, 1024)); \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
  VLLM_DISPATCH_FLOATING_TYPES( \
      input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \
        vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
                                         input.data_ptr<scalar_t>(), d, \
                                         PARAM); \
      });
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PACKED_KERNEL, PARAM) \
  auto dtype = input.scalar_type(); \
  int d = input.size(-1) / 2; \
  int64_t num_tokens = input.numel() / input.size(-1); \
  if (num_tokens == 0) { \
    return; \
  } \
  dim3 grid(num_tokens); \
  int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
  int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
  int vec_size = support_vec / at::elementSize(dtype); \
  const bool use_vec = (d % vec_size == 0); \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
  if (use_vec) { \
    dim3 block(std::min(d / vec_size, 1024)); \
    if (cc_major >= 10 && num_tokens > 128) { \
      VLLM_DISPATCH_FLOATING_TYPES( \
          dtype, "act_and_mul_kernel_with_param", [&] { \
            vllm::act_and_mul_kernel_with_param< \
                scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
                KERNEL<scalar_t>, \
                PACKED_KERNEL< \
                    typename vllm::PackedTraits<scalar_t>::packed_t>, \
                true, true><<<grid, block, 0, stream>>>( \
                out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
                PARAM); \
          }); \
    } else { \
      VLLM_DISPATCH_FLOATING_TYPES( \
          dtype, "act_and_mul_kernel_with_param", [&] { \
            vllm::act_and_mul_kernel_with_param< \
                scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
                KERNEL<scalar_t>, \
                PACKED_KERNEL< \
                    typename vllm::PackedTraits<scalar_t>::packed_t>, \
                true, false><<<grid, block, 0, stream>>>( \
                out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
                PARAM); \
          }); \
    } \
  } else { \
    dim3 block(std::min(d, 1024)); \
    VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel_with_param", [&] { \
      vllm::act_and_mul_kernel_with_param< \
          scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
          KERNEL<scalar_t>, \
          PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
          false><<<grid, block, 0, stream>>>( \
          out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, PARAM); \
    }); \
  }

#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \
  int d = input.size(-1) / 2; \
@@ -309,7 +561,8 @@ __global__ void swigluoai_and_mul_kernel(
void fatrelu_and_mul(torch::Tensor& out,    // [..., d],
                     torch::Tensor& input,  // [..., 2 * d]
                     double threshold) {
  LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
  LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(
      vllm::fatrelu_kernel, vllm::packed_fatrelu_kernel, threshold);
}
void swigluoai_and_mul(torch::Tensor& out,    // [..., d]
                       torch::Tensor& input,  // [..., 2 * d]
@@ -319,39 +572,41 @@ void swigluoai_and_mul(torch::Tensor& out, // [..., d]
namespace vllm {

// Element-wise activation kernel template.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), bool use_vec,
          bool use_256b = false>
__global__ void activation_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., d]
    const int d) {
  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
  const int64_t token_idx = blockIdx.x;
  const scalar_t* in_ptr = input + token_idx * d;
  scalar_t* out_ptr = out + token_idx * d;
  const scalar_t* in_ptr = input + blockIdx.x * d;
  scalar_t* out_ptr = out + blockIdx.x * d;

  // Check alignment for 128-bit vectorized access
  const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);

  if (aligned && d >= VEC_SIZE) {
    // Fast path: 128-bit vectorized loop
    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
  if constexpr (use_vec) {
    // Fast path: 128-bit/256-bit vectorized loop
    using vec_t = typename VecTraits<use_256b>::vec_t;
    constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
    constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(scalar_t);
    const vec_t* in_vec = reinterpret_cast<const vec_t*>(in_ptr);
    vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
    const int num_vecs = d / VEC_SIZE;
    const int vec_end = num_vecs * VEC_SIZE;

    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
      int4 v = VLLM_LDG(&in_vec[i]), r;
      vec_t v;
      if constexpr (use_256b) {
        ld256(v, &in_vec[i]);
      } else {
        v = VLLM_LDG(&in_vec[i]);
      }
      auto* vp = reinterpret_cast<scalar_t*>(&v);
      auto* rp = reinterpret_cast<scalar_t*>(&r);
#pragma unroll
      for (int j = 0; j < VEC_SIZE; j++) {
        rp[j] = ACT_FN(vp[j]);
        vp[j] = ACT_FN(vp[j]);
      }
      if constexpr (use_256b) {
        st256(v, &out_vec[i]);
      } else {
        out_vec[i] = v;
      }
      out_vec[i] = r;
    }
    // Scalar cleanup for remaining elements
    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
    }
  } else {
    // Scalar fallback for unaligned data or small d
@@ -365,18 +620,43 @@ __global__ void activation_kernel(
}  // namespace vllm

// Launch element-wise activation kernel.
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
  int d = input.size(-1); \
  int64_t num_tokens = input.numel() / d; \
  dim3 grid(num_tokens); \
  dim3 block(std::min(d, 1024)); \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
    vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
        <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
                                     input.data_ptr<scalar_t>(), d); \
  });
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
  auto dtype = input.scalar_type(); \
  int d = input.size(-1); \
  int64_t num_tokens = input.numel() / input.size(-1); \
  if (num_tokens == 0) { \
    return; \
  } \
  dim3 grid(num_tokens); \
  int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
  int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
  int vec_size = support_vec / at::elementSize(dtype); \
  const bool use_vec = (d % vec_size == 0); \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
  if (use_vec) { \
    dim3 block(std::min(d / vec_size, 1024)); \
    if (cc_major >= 10 && num_tokens > 128) { \
      VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
        vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, true> \
            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
                                         input.data_ptr<scalar_t>(), d); \
      }); \
    } else { \
      VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
        vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, false> \
            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
                                         input.data_ptr<scalar_t>(), d); \
      }); \
    } \
  } else { \
    dim3 block(std::min(d, 1024)); \
    VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
      vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, false> \
          <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
                                       input.data_ptr<scalar_t>(), d); \
    }); \
  }

namespace vllm {

@@ -1234,8 +1234,13 @@ void cp_gather_and_upconvert_fp8_kv_cache(
              "src_cache and seq_lens must be on the same device");
  TORCH_CHECK(src_cache.device() == workspace_starts.device(),
              "src_cache and workspace_starts must be on the same device");

  TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
  auto dtype = src_cache.scalar_type();
  TORCH_CHECK(
      dtype == at::ScalarType::Byte ||              // uint8
          dtype == at::ScalarType::Float8_e4m3fn ||  // fp8 e4m3
          dtype == at::ScalarType::Float8_e5m2,      // fp8 e5m2
      "src_cache must be uint8, float8_e4m3fn, or float8_e5m2, but got ",
      src_cache.dtype());
  TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
  TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");

@@ -1244,14 +1249,21 @@ void cp_gather_and_upconvert_fp8_kv_cache(
  int64_t cache_entry_stride = src_cache.stride(1);
  int64_t dst_entry_stride = dst.stride(0);

  const uint8_t* src_ptr = nullptr;
  if (dtype == at::ScalarType::Byte) {
    src_ptr = src_cache.data_ptr<uint8_t>();
  } else {
    // float8_e4m3fn or float8_e5m2
    src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
  }

  // Decide on the number of splits based on the batch size
  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
  dim3 grid(batch_size, num_splits);
  dim3 block(576);

  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
      src_cache.data_ptr<uint8_t>(),
      reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
      src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
      block_table_stride, cache_block_stride, cache_entry_stride,
@@ -1293,7 +1305,8 @@ void indexer_k_quant_and_cache(
  const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
  static const std::string kv_cache_dtype = "fp8_e4m3";
  DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), kv_cache_dtype,
                             CALL_INDEXER_K_QUANT_AND_CACHE);
}

@@ -16,6 +16,8 @@ torch::Tensor get_scheduler_metadata(
    isa = cpu_attention::ISA::VEC16;
  } else if (isa_hint == "neon") {
    isa = cpu_attention::ISA::NEON;
  } else if (isa_hint == "vxe") {
    isa = cpu_attention::ISA::VXE;
  } else {
    TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
  }
@@ -100,6 +102,8 @@ void cpu_attn_reshape_and_cache(
    return cpu_attention::ISA::VEC16;
  } else if (isa == "neon") {
    return cpu_attention::ISA::NEON;
  } else if (isa == "vxe") {
    return cpu_attention::ISA::VXE;
  } else {
    TORCH_CHECK(false, "Invalid ISA type: " + isa);
  }

@@ -12,7 +12,7 @@
#include "cpu/utils.hpp"

namespace cpu_attention {
enum class ISA { AMX, VEC, VEC16, NEON };
enum class ISA { AMX, VEC, VEC16, NEON, VXE };

template <ISA isa, typename scalar_t, int64_t head_dim>
class AttentionImpl {};
@@ -821,7 +821,7 @@ struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
};

#if !defined(__powerpc__) && !defined(__s390x__)
#if !defined(__powerpc__)
template <>
struct VecTypeTrait<c10::Half> {
  using vec_t = vec_op::FP16Vec16;
386 csrc/cpu/cpu_attn_vxe.hpp Normal file
@@ -0,0 +1,386 @@
#ifndef CPU_ATTN_VXE_HPP
#define CPU_ATTN_VXE_HPP

#include "cpu_attn_impl.hpp"
#include <vecintrin.h>
#include <type_traits>

namespace cpu_attention {

namespace {

// s390x Vector = 16 bytes (128 bits)
#define BLOCK_SIZE_ALIGNMENT 32
#define HEAD_SIZE_ALIGNMENT 32
#define MAX_Q_HEAD_NUM_PER_ITER 16

template <typename kv_cache_t>
FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, __vector float& b0,
                                     __vector float& b1);

// [1] Float Specialization
template <>
FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, __vector float& b0,
                                            __vector float& b1) {
  // Explicitly cast to long long for offset, and float* for pointer
  b0 = vec_xl((long long)0, const_cast<float*>(p));
  b1 = vec_xl((long long)0, const_cast<float*>(p + 4));
}

// [2] BFloat16 Specialization (Big Endian Fix)
template <>
FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
                                                    __vector float& b0,
                                                    __vector float& b1) {
  // 1. Load 8 BF16s (16 bytes) into one vector
  // Explicit cast to unsigned short* for vec_xl to return vector unsigned short
  __vector unsigned short raw = vec_xl((long long)0, (unsigned short*)p);

  // 2. Prepare Zero vector
  __vector unsigned short zeros = vec_splat_u16(0);

  // 3. Merge High/Low to expand BF16 -> Float32
  // On Big Endian, a float is [BF16_bits | 16_zero_bits]
  b0 = (__vector float)vec_mergeh(raw, zeros);
  b1 = (__vector float)vec_mergel(raw, zeros);
}

template <>
FORCE_INLINE void load_row8_B_as_f32<c10::Half>(const c10::Half* p,
                                                __vector float& b0,
                                                __vector float& b1) {
  alignas(16) float tmp[8];

  // Manual unroll / conversion
  tmp[0] = static_cast<float>(p[0]);
  tmp[1] = static_cast<float>(p[1]);
  tmp[2] = static_cast<float>(p[2]);
  tmp[3] = static_cast<float>(p[3]);
  tmp[4] = static_cast<float>(p[4]);
  tmp[5] = static_cast<float>(p[5]);
  tmp[6] = static_cast<float>(p[6]);
  tmp[7] = static_cast<float>(p[7]);

  // Explicit arguments for intrinsic: (long long offset, float* ptr)
  b0 = vec_xl((long long)0, (float*)tmp);
  b1 = vec_xl((long long)0, (float*)(tmp + 4));
}

template <int32_t M, typename kv_cache_t>
FORCE_INLINE void gemm_micro_s390x_Mx8_Ku4(
    const float* __restrict A,       // [M x K]
    const kv_cache_t* __restrict B,  // [K x 8]
    float* __restrict C,             // [M x 8]
    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
  static_assert(1 <= M && M <= 8, "M must be in [1,8]");

  // Helper macros to unroll codegen for M rows
#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
#define IF_M(i) if constexpr (M > (i))

  // 1. Define A pointers
#define DECL_A(i) const float* a##i = A + (i) * lda;
  ROWS_APPLY(DECL_A)
#undef DECL_A

  // 2. Define Accumulators (2 vectors covers 8 columns)
#define DECL_ACC(i) __vector float acc##i##_0, acc##i##_1;
  ROWS_APPLY(DECL_ACC)
#undef DECL_ACC

  // 3. Initialize Accumulators (Load C or Zero)
#define INIT_ACC(i) \
  IF_M(i) { \
    if (accumulate) { \
      acc##i##_0 = \
          vec_xl((long long)0, const_cast<float*>(C + (i) * ldc + 0)); \
      acc##i##_1 = \
          vec_xl((long long)0, const_cast<float*>(C + (i) * ldc + 4)); \
    } else { \
      acc##i##_0 = vec_splats(0.0f); \
      acc##i##_1 = vec_splats(0.0f); \
    } \
  }
  ROWS_APPLY(INIT_ACC)
#undef INIT_ACC

  int32_t k = 0;

  for (; k + 3 < K; k += 4) {
    // Load 4 values of A for each Row M: A[k...k+3]
#define LOAD_A4(i) \
  __vector float a##i##v; \
  IF_M(i) a##i##v = vec_xl((long long)0, const_cast<float*>(a##i + k));
    ROWS_APPLY(LOAD_A4)
#undef LOAD_A4

    // Helper: FMA for specific lane L of A
    // s390x: vec_madd(b, vec_splat(a, lane), acc)
#define FMAS_LANE(i, aiv, L) \
  IF_M(i) { \
    __vector float a_broad = vec_splat(aiv, L); \
    acc##i##_0 = vec_madd(b0, a_broad, acc##i##_0); \
    acc##i##_1 = vec_madd(b1, a_broad, acc##i##_1); \
  }

    // Unroll K=0..3
    {
      __vector float b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 0) * ldb, b0, b1);
#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
      ROWS_APPLY(STEP_K0)
#undef STEP_K0
    }
    {
      __vector float b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 1) * ldb, b0, b1);
#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
      ROWS_APPLY(STEP_K1)
#undef STEP_K1
    }
    {
      __vector float b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 2) * ldb, b0, b1);
#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
      ROWS_APPLY(STEP_K2)
#undef STEP_K2
    }

    {
      __vector float b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 3) * ldb, b0, b1);
#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
      ROWS_APPLY(STEP_K3)
#undef STEP_K3
    }
#undef FMAS_LANE
  }

  for (; k < K; ++k) {
    __vector float b0, b1;
    load_row8_B_as_f32<kv_cache_t>(B + (int64_t)k * ldb, b0, b1);
#define TAIL_ROW(i) \
  IF_M(i) { \
    __vector float ai = vec_splats(*(a##i + k)); \
    acc##i##_0 = vec_madd(b0, ai, acc##i##_0); \
    acc##i##_1 = vec_madd(b1, ai, acc##i##_1); \
  }
    ROWS_APPLY(TAIL_ROW)
#undef TAIL_ROW
  }

#define STORE_ROW(i) \
  IF_M(i) { \
    vec_xst(acc##i##_0, 0, C + (i) * ldc + 0); \
    vec_xst(acc##i##_1, 0, C + (i) * ldc + 4); \
  }
  ROWS_APPLY(STORE_ROW)
#undef STORE_ROW

#undef ROWS_APPLY
#undef IF_M
}

template <int32_t N, typename kv_cache_t>
FORCE_INLINE void gemm_macro_s390x_Mx8_Ku4(const float* __restrict A,
                                           const kv_cache_t* __restrict B,
                                           float* __restrict C, int32_t M,
                                           int32_t K, int64_t lda, int64_t ldb,
                                           int64_t ldc, bool accumulate) {
  static_assert(N % 8 == 0, "N must be a multiple of 8");
  for (int32_t m = 0; m < M;) {
    int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
    const float* Ab = A + m * lda;
    float* Cb = C + m * ldc;

    for (int32_t n = 0; n < N; n += 8) {
      const kv_cache_t* Bn = B + n;
      float* Cn = Cb + n;
      switch (mb) {
        case 8:
          gemm_micro_s390x_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K,
                                                  accumulate);
          break;
        case 4:
          gemm_micro_s390x_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K,
                                                  accumulate);
          break;
        case 2:
          gemm_micro_s390x_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K,
                                                  accumulate);
          break;
        default:
          gemm_micro_s390x_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K,
                                                  accumulate);
          break;
      }
    }
    m += mb;
  }
}

template <typename kv_cache_t>
class TileGemmS390X {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size,
                                float* __restrict__ a_tile,
                                kv_cache_t* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    if constexpr (phase == AttentionGemmPhase::QK) {
      gemm_macro_s390x_Mx8_Ku4<BLOCK_SIZE_ALIGNMENT, kv_cache_t>(
          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
    } else {
      gemm_macro_s390x_Mx8_Ku4<HEAD_SIZE_ALIGNMENT, kv_cache_t>(
          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
          accum_c);
    }
  }
};

}  // namespace

template <typename scalar_t, int64_t head_dim>
class AttentionImpl<ISA::VXE, scalar_t, head_dim> {
 public:
  using query_t = scalar_t;
  using q_buffer_t = float;
  using kv_cache_t = scalar_t;
  using logits_buffer_t = float;
  using partial_output_buffer_t = float;
  using prob_buffer_t = float;

  constexpr static int64_t BlockSizeAlignment = BLOCK_SIZE_ALIGNMENT;
  constexpr static int64_t HeadDimAlignment = HEAD_SIZE_ALIGNMENT;
  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
  constexpr static int64_t HeadDim = head_dim;
  constexpr static ISA ISAType = ISA::VXE;
  constexpr static bool scale_on_logits =
      false;  // Scale is applied to Q during copy

 public:
  AttentionImpl() {}

  template <template <typename tile_gemm_t> typename attention>
  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
    attention<TileGemmS390X<kv_cache_t>> attention_iteration;
    attention_iteration(CPU_ATTENTION_PARAMS);
  }

  // Strides for Memory Layout
  constexpr static int64_t k_cache_token_group_stride(
      const int32_t block_size) {
    return BlockSizeAlignment;  // [head_dim, block_size] layout
  }

  constexpr static int64_t v_cache_token_group_stride(
      const int32_t block_size) {
    return head_dim * BlockSizeAlignment;
  }

  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
    return HeadDimAlignment;
  }

  static void copy_q_heads_tile(scalar_t* __restrict__ src,
                                float* __restrict__ q_buffer,
                                const int32_t q_num,
                                const int32_t q_heads_per_kv,
                                const int64_t q_num_stride,
                                const int64_t q_head_stride, float scale) {
    __vector float scale_vec = vec_splats(scale);
    constexpr bool is_bf16 = std::is_same<scalar_t, c10::BFloat16>::value;

    // Process 8 elements at a time (32 bytes of float output)
    for (int32_t i = 0; i < q_num; ++i) {
      for (int32_t h = 0; h < q_heads_per_kv; ++h) {
        scalar_t* curr_src = src + i * q_num_stride + h * q_head_stride;
        float* curr_dst =
            q_buffer + i * q_heads_per_kv * head_dim + h * head_dim;

        int32_t d = 0;
        for (; d <= head_dim - 8; d += 8) {
          if constexpr (is_bf16) {
            __vector float v0, v1;
            // Reuse our Big-Endian-Safe loader
            load_row8_B_as_f32<scalar_t>(curr_src + d, v0, v1);

            v0 = vec_mul(v0, scale_vec);
            v1 = vec_mul(v1, scale_vec);

            vec_xst(v0, 0, curr_dst + d);
            vec_xst(v1, 0, curr_dst + d + 4);
          } else {
            __vector float v0 = vec_xl((long long)0, (float*)curr_src + d);
            __vector float v1 = vec_xl((long long)0, (float*)curr_src + d + 4);

            v0 = vec_mul(v0, scale_vec);
            v1 = vec_mul(v1, scale_vec);

            vec_xst(v0, 0, curr_dst + d);
            vec_xst(v1, 0, curr_dst + d + 4);
          }
        }

        for (; d < head_dim; ++d) {
          float val = static_cast<float>(curr_src[d]);
          curr_dst[d] = val * scale;
        }
      }
    }
  }

  static void reshape_and_cache(
      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
      const int64_t head_num, const int64_t key_head_num_stride,
      const int64_t value_head_num_stride, const int64_t num_blocks,
      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
      const int64_t block_size, const int64_t block_size_stride) {
#pragma omp parallel for collapse(2)
    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
        const int64_t pos = slot_mapping[token_idx];
        if (pos < 0) continue;

        const int64_t block_idx = pos / block_size;
        const int64_t block_offset = pos % block_size;

        {
          const scalar_t* key_src = key + token_idx * key_token_num_stride +
                                    head_idx * key_head_num_stride;
          scalar_t* key_dst = key_cache + block_idx * num_blocks_stride +
                              head_idx * cache_head_num_stride + block_offset;

          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
            key_dst[j] = key_src[i];
          }
        }

        {
          const scalar_t* val_src = value + token_idx * value_token_num_stride +
                                    head_idx * value_head_num_stride;
          scalar_t* val_dst = value_cache + block_idx * num_blocks_stride +
                              head_idx * cache_head_num_stride +
                              block_offset * head_dim;

          std::memcpy(val_dst, val_src, sizeof(scalar_t) * head_dim);
        }
      }
    }
  }
};

}  // namespace cpu_attention

#undef BLOCK_SIZE_ALIGNMENT
#undef HEAD_SIZE_ALIGNMENT
#undef MAX_Q_HEAD_NUM_PER_ITER

#endif
@@ -147,7 +147,7 @@ void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input,
                    const int32_t token_num, const int32_t expert_num,
                    const int32_t topk_num, const int32_t input_size_13,
                    const int32_t output_size_13, const int32_t input_size_2,
                    const int32_t output_size_2) {
                    const int32_t output_size_2, const bool skip_weighted) {
  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
  constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
  constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
@@ -582,6 +582,11 @@ void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input,
      scalar_t* __restrict__ curr_output_buffer =
          output + token_id * output_size_2;

      if (skip_weighted) {
        // Only for topk_num == 1
        *curr_weight = 1.0f;
      }

      if (topk_num > 1) {
        {
          int32_t w2_output_idx = curr_expand_token_id_index_buffer[0];
@@ -699,7 +704,7 @@ void cpu_fused_moe(
    const std::optional<torch::Tensor>& w2_bias,  // [expert_num, output_size_2]
    const torch::Tensor& topk_weights,  // [token_num, k], float32
    const torch::Tensor& topk_id,       // [token_num, k], int32
    const std::string& act, const std::string& isa) {
    const bool skip_weighted, const std::string& act, const std::string& isa) {
  const int32_t token_num = input.size(0);
  const int32_t input_size_13 = input.size(1);
  const int64_t input_stride = input.stride(0);
@@ -711,6 +716,8 @@ void cpu_fused_moe(
  const int32_t topk_num = topk_id.size(1);
  const FusedMOEAct act_type = get_act_type(act);
  cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
  TORCH_CHECK(!skip_weighted || topk_num == 1,
              "skip_weighted is only supported for topk=1 on CPU");

  VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() {
    CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
@@ -721,7 +728,7 @@ void cpu_fused_moe(
          w2_bias.has_value() ? w2_bias->data_ptr<scalar_t>() : nullptr,
          topk_weights.data_ptr<float>(), topk_id.data_ptr<int32_t>(), act_type,
          token_num, expert_num, topk_num, input_size_13, output_size_13,
          input_size_2, output_size_2);
          input_size_2, output_size_2, skip_weighted);
    });
  });
}

Some files were not shown because too many files have changed in this diff.