Compare commits
896 Commits
v0.11.1
...
v0.13.0rc3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f124b56786 | ||
|
|
d78e128b8b | ||
|
|
761b730dcb | ||
|
|
f34eca5f01 | ||
|
|
4cd332f3cf | ||
|
|
16484d394c | ||
|
|
e397bd6592 | ||
|
|
6a88d590bb | ||
|
|
ad8c073131 | ||
|
|
c01d589813 | ||
|
|
60dbf7d8f1 | ||
|
|
a450c64a30 | ||
|
|
b2191abdca | ||
|
|
51e5b3e3c4 | ||
|
|
ec154c36ee | ||
|
|
970713d4a4 | ||
|
|
17fec3af09 | ||
|
|
855b101d75 | ||
|
|
d0502b4928 | ||
|
|
3f175f18a2 | ||
|
|
ed586e7724 | ||
|
|
2a1776b7ac | ||
|
|
185c22bf2f | ||
|
|
e4806d973a | ||
|
|
4429d934de | ||
|
|
33278073d6 | ||
|
|
1adeb3b84c | ||
|
|
e3a1cd1c59 | ||
|
|
3778673ea8 | ||
|
|
b337647aa0 | ||
|
|
a524d1ba0a | ||
|
|
87b4d1557d | ||
|
|
84e23d103d | ||
|
|
738648fb81 | ||
|
|
917fdae5b2 | ||
|
|
e2ed238885 | ||
|
|
174e39ead7 | ||
|
|
9ccbf6b692 | ||
|
|
ae2e503dda | ||
|
|
9e33a1a75b | ||
|
|
add4b0ca44 | ||
|
|
ae88aada38 | ||
|
|
5ccf0efa84 | ||
|
|
994acec0cc | ||
|
|
48b8456ff9 | ||
|
|
5b64ac21f9 | ||
|
|
a8ec486592 | ||
|
|
6ecc1e411b | ||
|
|
0bb0bae436 | ||
|
|
060893654d | ||
|
|
e9add129ad | ||
|
|
3224ea9915 | ||
|
|
3a20450d31 | ||
|
|
1a55cfafcb | ||
|
|
add1b9d3de | ||
|
|
dcb31196da | ||
|
|
f569c654e1 | ||
|
|
97f2f160fd | ||
|
|
29f7d97715 | ||
|
|
dc7fb5bebe | ||
|
|
24429d5924 | ||
|
|
6e78ed6ba7 | ||
|
|
7c16f3fbcc | ||
|
|
ddbfbe5278 | ||
|
|
763963aa73 | ||
|
|
39cefbdf17 | ||
|
|
ace34e3783 | ||
|
|
e5db3e2774 | ||
|
|
64251f48df | ||
|
|
1cec5b7ea9 | ||
|
|
b09806e28f | ||
|
|
fdc135d768 | ||
|
|
4fa7ce46f3 | ||
|
|
57e9bf1864 | ||
|
|
2f32a68d75 | ||
|
|
f5dfbbd8e9 | ||
|
|
fc0119425c | ||
|
|
86a3261525 | ||
|
|
08f8a5627e | ||
|
|
b4039c08b5 | ||
|
|
1e6b115300 | ||
|
|
13618626df | ||
|
|
6ec0d8dbe4 | ||
|
|
9693dd0fe3 | ||
|
|
1f19d8f899 | ||
|
|
cd7740ac5c | ||
|
|
02a5880394 | ||
|
|
d2c919dcc2 | ||
|
|
f3237f3f6b | ||
|
|
9c0ee995a8 | ||
|
|
09ad3b76b3 | ||
|
|
dc13c99eed | ||
|
|
3e34adcdfb | ||
|
|
3e41992fec | ||
|
|
91401c7a26 | ||
|
|
f90319d5d1 | ||
|
|
302b2c1eb9 | ||
|
|
8f8fda261a | ||
|
|
fe1787107e | ||
|
|
783644e4ac | ||
|
|
197473c4e7 | ||
|
|
947dfda9c2 | ||
|
|
9f2fc16a69 | ||
|
|
6a6fc41c79 | ||
|
|
f355ad5412 | ||
|
|
042da73244 | ||
|
|
b5945d49c0 | ||
|
|
ba80926681 | ||
|
|
0ab23c2b2b | ||
|
|
48661d275f | ||
|
|
d527cf0b3d | ||
|
|
2cc5affc38 | ||
|
|
a00d88973d | ||
|
|
61249b177d | ||
|
|
c817b14151 | ||
|
|
3efdc3feae | ||
|
|
0efd9f867c | ||
|
|
90d6cf921f | ||
|
|
cf3eacfe58 | ||
|
|
92fea56fd1 | ||
|
|
e458270a95 | ||
|
|
72aaac5b66 | ||
|
|
0e71eaa644 | ||
|
|
8781cd6b88 | ||
|
|
aa3c250c48 | ||
|
|
305b168a9f | ||
|
|
93db3256a4 | ||
|
|
17cb540248 | ||
|
|
97a042f3bc | ||
|
|
3a3b06ee70 | ||
|
|
f4417f8449 | ||
|
|
a11f4a81e0 | ||
|
|
853611bb18 | ||
|
|
d917747c95 | ||
|
|
a5f9fb5960 | ||
|
|
4515eb1a0b | ||
|
|
13d63b65e0 | ||
|
|
b4e8b91278 | ||
|
|
6299628d32 | ||
|
|
fba8906930 | ||
|
|
d02d1043de | ||
|
|
979f50efd0 | ||
|
|
36c9ce2554 | ||
|
|
1a516557e1 | ||
|
|
d6464f2679 | ||
|
|
7e24e5d4d6 | ||
|
|
5a87d8b9b1 | ||
|
|
d1e1fb4363 | ||
|
|
b51255f369 | ||
|
|
b4054c8ab4 | ||
|
|
25221b44bb | ||
|
|
8580919ac3 | ||
|
|
166ac3c94d | ||
|
|
b9e0951f96 | ||
|
|
fcb894222f | ||
|
|
6ccb7baeb1 | ||
|
|
eea41804a4 | ||
|
|
9f042ba26b | ||
|
|
e72d65b959 | ||
|
|
a9e4106f28 | ||
|
|
e8e8cd73e5 | ||
|
|
253305d5b2 | ||
|
|
794a7875ee | ||
|
|
2dcbac9077 | ||
|
|
aacf0abf8b | ||
|
|
c756fb6781 | ||
|
|
d017bceb08 | ||
|
|
cebda2a4af | ||
|
|
53d2420b44 | ||
|
|
9db78f34dc | ||
|
|
434ac76a7c | ||
|
|
ed7af3178a | ||
|
|
180345807f | ||
|
|
d007387aa7 | ||
|
|
3bdd426636 | ||
|
|
06462392e4 | ||
|
|
7d80c73d42 | ||
|
|
b75f826fca | ||
|
|
c3487aca34 | ||
|
|
abe93bce59 | ||
|
|
2e7035dd8c | ||
|
|
4c2e10ea19 | ||
|
|
03b5f940fd | ||
|
|
2e7054da06 | ||
|
|
3c680f4a17 | ||
|
|
fccd532587 | ||
|
|
00e5cbb967 | ||
|
|
7618dc973d | ||
|
|
f8dacc66b6 | ||
|
|
7cab92fd45 | ||
|
|
73a484caa1 | ||
|
|
b37bf51e75 | ||
|
|
95501a70ec | ||
|
|
e858bfe051 | ||
|
|
d471b2aff0 | ||
|
|
9e6562a3f6 | ||
|
|
0b6a8a304c | ||
|
|
804e3468c0 | ||
|
|
83319b44c2 | ||
|
|
56037dfa2f | ||
|
|
5dcd593baf | ||
|
|
5c213d2899 | ||
|
|
ee14644ba9 | ||
|
|
1166c31cc7 | ||
|
|
03416eada6 | ||
|
|
c72ea10723 | ||
|
|
67475a6e81 | ||
|
|
9c32df6101 | ||
|
|
aeb82b1930 | ||
|
|
aed846917f | ||
|
|
e4605d225e | ||
|
|
58d5b3f514 | ||
|
|
c2e1987a6e | ||
|
|
e130845984 | ||
|
|
4b03b50211 | ||
|
|
4c6fd25880 | ||
|
|
03b91f7262 | ||
|
|
f6227c22ab | ||
|
|
ea657f2078 | ||
|
|
db14f61f2d | ||
|
|
78c7503364 | ||
|
|
e41312a2f5 | ||
|
|
7b35011ad1 | ||
|
|
ae339b1a67 | ||
|
|
0ee6416f67 | ||
|
|
d9417096d1 | ||
|
|
9d6235ca9a | ||
|
|
f1599ca55d | ||
|
|
60d17251c9 | ||
|
|
1fb632fdb6 | ||
|
|
6af70e11a0 | ||
|
|
ae0f69b16a | ||
|
|
799804d140 | ||
|
|
0d402d2600 | ||
|
|
d1b5e7afbf | ||
|
|
fcd5306f65 | ||
|
|
398a596ed2 | ||
|
|
67312cad11 | ||
|
|
87aee9ed2b | ||
|
|
184076c3fe | ||
|
|
eb1051fb95 | ||
|
|
80433e225e | ||
|
|
5c2433a6f3 | ||
|
|
77072e93b3 | ||
|
|
2e660c2434 | ||
|
|
408cf42f67 | ||
|
|
9e77ffca3f | ||
|
|
bcb6f5947f | ||
|
|
cd00c443d2 | ||
|
|
d143271234 | ||
|
|
c6df05ebb4 | ||
|
|
d726a7b0ed | ||
|
|
344b50d525 | ||
|
|
735284ed86 | ||
|
|
444f0e3f33 | ||
|
|
af0444bf40 | ||
|
|
0044c4038c | ||
|
|
b952f4d3c3 | ||
|
|
541a2ef892 | ||
|
|
b0f4866a77 | ||
|
|
879ddb09c3 | ||
|
|
1b0482b9d1 | ||
|
|
e83b7e379c | ||
|
|
27f4c2fd46 | ||
|
|
a49d813fa8 | ||
|
|
17eb25e327 | ||
|
|
dce6d229f7 | ||
|
|
cbedb703cc | ||
|
|
8d3da4c79d | ||
|
|
421125d03a | ||
|
|
671427efbf | ||
|
|
21bb323542 | ||
|
|
17a9abec2b | ||
|
|
92c35abb24 | ||
|
|
43e7593031 | ||
|
|
c46b932df2 | ||
|
|
6476382384 | ||
|
|
d6aeaddf4a | ||
|
|
a238cbd89d | ||
|
|
4026ae31e9 | ||
|
|
b12f4a9830 | ||
|
|
40a046cd82 | ||
|
|
e858bc4d14 | ||
|
|
e3fbb6f152 | ||
|
|
c4d62618ca | ||
|
|
62079d8600 | ||
|
|
bf4a901af9 | ||
|
|
7e31c3a3f6 | ||
|
|
dc839ad03d | ||
|
|
02a4169193 | ||
|
|
7b5575fa7d | ||
|
|
77e4472809 | ||
|
|
962d703818 | ||
|
|
e23ca3a0e8 | ||
|
|
3633035a3f | ||
|
|
bff78310d9 | ||
|
|
adb315060c | ||
|
|
4e26d3b09e | ||
|
|
66e674cdd5 | ||
|
|
dff0a2b394 | ||
|
|
dc264bcea1 | ||
|
|
78c44fd722 | ||
|
|
e7296b08da | ||
|
|
da7bc54ea8 | ||
|
|
949a6a19d2 | ||
|
|
2c174420f5 | ||
|
|
0d8a7d8a26 | ||
|
|
9843e332da | ||
|
|
b7d85cf25c | ||
|
|
c2894d3883 | ||
|
|
3628bcaaf2 | ||
|
|
b73b158ab0 | ||
|
|
7ae13c66ba | ||
|
|
f16356fe36 | ||
|
|
65ee97288a | ||
|
|
62b3333448 | ||
|
|
feecba09af | ||
|
|
6038b1b04b | ||
|
|
60a66ea2dc | ||
|
|
06579f9a82 | ||
|
|
6e865b6a83 | ||
|
|
d698bb382d | ||
|
|
2c22c4ca2d | ||
|
|
5867819eaf | ||
|
|
7c9b2c8f81 | ||
|
|
0098a6e3da | ||
|
|
befb59e5b1 | ||
|
|
aaddc9c82a | ||
|
|
263c38d74d | ||
|
|
bcf43ab1f3 | ||
|
|
4470ee2f90 | ||
|
|
690cc3ef20 | ||
|
|
1f0d184590 | ||
|
|
c8ab988b15 | ||
|
|
48a5fff66e | ||
|
|
1119f6e47a | ||
|
|
e10c84e06a | ||
|
|
ece2825a29 | ||
|
|
652ba93da3 | ||
|
|
6dcb07f676 | ||
|
|
46cbbca05c | ||
|
|
b286a311c2 | ||
|
|
990f806473 | ||
|
|
5b4b42c0b6 | ||
|
|
cc050558f4 | ||
|
|
5c32a06a04 | ||
|
|
dd97e047e0 | ||
|
|
9998ea5b57 | ||
|
|
74c4d80c6c | ||
|
|
1b7c7f5159 | ||
|
|
6796ce8bdb | ||
|
|
e96a6a6dca | ||
|
|
6366c098d7 | ||
|
|
842aba501d | ||
|
|
f2f4cea6cc | ||
|
|
dfdda96747 | ||
|
|
ffdd18111b | ||
|
|
b8a6ae4158 | ||
|
|
899e2ef558 | ||
|
|
68eb5c8d97 | ||
|
|
5430e110c0 | ||
|
|
3f1b03739a | ||
|
|
9aa33a74b0 | ||
|
|
fd68e909db | ||
|
|
404fc4bfc0 | ||
|
|
82a64b3d8f | ||
|
|
9ae2f60374 | ||
|
|
80f8af4b2f | ||
|
|
8aaa81b35f | ||
|
|
fca3f46658 | ||
|
|
28097d5638 | ||
|
|
dd38ba3a26 | ||
|
|
5f91cdda75 | ||
|
|
33a3d6c798 | ||
|
|
c493b9d092 | ||
|
|
ad32e3e19c | ||
|
|
1109f98288 | ||
|
|
b5407869c8 | ||
|
|
2902c34826 | ||
|
|
ac1886588f | ||
|
|
2fc5d6e0d7 | ||
|
|
afe9eb408e | ||
|
|
19bee6d12d | ||
|
|
dd5d1ef780 | ||
|
|
d1f7392c5f | ||
|
|
9ae3c55b10 | ||
|
|
9bcf92295a | ||
|
|
5aa9b09040 | ||
|
|
1bb17ecb39 | ||
|
|
15b1511a15 | ||
|
|
b78772c433 | ||
|
|
f5d3d93c40 | ||
|
|
78f4bb0ba8 | ||
|
|
b294e28db2 | ||
|
|
787b84a9fc | ||
|
|
42c1949643 | ||
|
|
cc4e296ea6 | ||
|
|
a21cd9ed23 | ||
|
|
7fe9c1a223 | ||
|
|
3f42b05fbc | ||
|
|
69520bc695 | ||
|
|
3a7751485b | ||
|
|
bbfb55c29e | ||
|
|
0bec63fa31 | ||
|
|
c719c40540 | ||
|
|
b08025a83b | ||
|
|
d7284a2604 | ||
|
|
506ed87e87 | ||
|
|
4dd7978374 | ||
|
|
5cdd664509 | ||
|
|
5f67361fd1 | ||
|
|
5d91d2b292 | ||
|
|
c014de1ec7 | ||
|
|
1b1e35aaf9 | ||
|
|
5e5646e206 | ||
|
|
0a9caca9f5 | ||
|
|
e6f114ac25 | ||
|
|
6fc5841db1 | ||
|
|
3ff5b53bc2 | ||
|
|
1528e079e2 | ||
|
|
afb1e5b380 | ||
|
|
1c593e117d | ||
|
|
a2b053dc85 | ||
|
|
1d93f11675 | ||
|
|
2d613de9ae | ||
|
|
c77b9929a0 | ||
|
|
63b1da76ba | ||
|
|
52cb349fc0 | ||
|
|
0ec8422171 | ||
|
|
2eb4fe9129 | ||
|
|
51c57b51dd | ||
|
|
60c3d413af | ||
|
|
68ffbca7e4 | ||
|
|
951445a52d | ||
|
|
d8c6210eea | ||
|
|
8bbcf8b6e7 | ||
|
|
70fb77b4dc | ||
|
|
48d15a32aa | ||
|
|
3b221cb661 | ||
|
|
0037b5746a | ||
|
|
f5b0846ba0 | ||
|
|
13ea39bc09 | ||
|
|
4b612664fd | ||
|
|
653591d5e7 | ||
|
|
e2fbfc955e | ||
|
|
a690fb5bd6 | ||
|
|
81fe3f82af | ||
|
|
53bf71b0f0 | ||
|
|
f441d36cee | ||
|
|
22274b2184 | ||
|
|
fc95521ba5 | ||
|
|
d0cd728907 | ||
|
|
fa8804ad9c | ||
|
|
4b40924998 | ||
|
|
c0dfc89485 | ||
|
|
44822d7ff2 | ||
|
|
342c4f1472 | ||
|
|
1336a1ea24 | ||
|
|
eaf81485ed | ||
|
|
38caf7fa1a | ||
|
|
cabc77cc86 | ||
|
|
ec7035c9d4 | ||
|
|
fc6acc88ca | ||
|
|
d0985c5feb | ||
|
|
092bb73b8a | ||
|
|
5d43f7372e | ||
|
|
37593deb02 | ||
|
|
f5516039c5 | ||
|
|
36db0a35e4 | ||
|
|
5cfa967efa | ||
|
|
b95db244ee | ||
|
|
ad9d656bfa | ||
|
|
f37e8938d2 | ||
|
|
f0a28bf661 | ||
|
|
86e178f7c4 | ||
|
|
014ece97c7 | ||
|
|
62de4f4257 | ||
|
|
83805a6078 | ||
|
|
1ab8fc8197 | ||
|
|
f72a817bdf | ||
|
|
ec38a7368d | ||
|
|
21c2627934 | ||
|
|
39d28108f4 | ||
|
|
cd719de5cb | ||
|
|
8c363ed666 | ||
|
|
64bc09ba27 | ||
|
|
47539cfd3e | ||
|
|
2afcec4dec | ||
|
|
9381b5cde0 | ||
|
|
66b5840287 | ||
|
|
82c795d6f2 | ||
|
|
e1464c3a08 | ||
|
|
a491b0911b | ||
|
|
b9d0504a36 | ||
|
|
1656ad3704 | ||
|
|
fa59fe417f | ||
|
|
fe3398fab2 | ||
|
|
ad7f714d62 | ||
|
|
f4341f45d3 | ||
|
|
34a984274e | ||
|
|
f223ed4181 | ||
|
|
04a797cd0e | ||
|
|
6afc0ffaf6 | ||
|
|
39e63dec7c | ||
|
|
4a80ad0a25 | ||
|
|
4b17ce6815 | ||
|
|
e23f665d83 | ||
|
|
ca1b1e7296 | ||
|
|
762a4a6ca9 | ||
|
|
b2c50eda50 | ||
|
|
1dcafb3dea | ||
|
|
ea3370b428 | ||
|
|
c625d7b1c6 | ||
|
|
6173682b6e | ||
|
|
9726e64530 | ||
|
|
3fd1fb0b60 | ||
|
|
a51f4186f2 | ||
|
|
7675ba30de | ||
|
|
7c1ed45848 | ||
|
|
1986de1375 | ||
|
|
3461e7efd8 | ||
|
|
fecae12cd7 | ||
|
|
8d9338fae4 | ||
|
|
d40c854009 | ||
|
|
4332955602 | ||
|
|
f946a8d743 | ||
|
|
6f9d81d03b | ||
|
|
fae6943068 | ||
|
|
3bcbb30cbf | ||
|
|
9e6bcda3ac | ||
|
|
9eec282cb5 | ||
|
|
0808eb813b | ||
|
|
460d8bbf2d | ||
|
|
e2f56c309d | ||
|
|
f8151b66fa | ||
|
|
1168768a2d | ||
|
|
8e7a891602 | ||
|
|
953d9c820b | ||
|
|
33b06a6f24 | ||
|
|
5c2b5cb422 | ||
|
|
3cb32e5d6e | ||
|
|
ccbdf51bd5 | ||
|
|
5f5521bd5d | ||
|
|
b2c1d294fa | ||
|
|
cc0f2a0e19 | ||
|
|
480598958e | ||
|
|
b34e8775a3 | ||
|
|
f4b76056ee | ||
|
|
37b15e97e8 | ||
|
|
c7ba1f6bc7 | ||
|
|
18523b87f6 | ||
|
|
745a3bae1a | ||
|
|
35657bcd7a | ||
|
|
be493e0b3c | ||
|
|
ae0ce1be27 | ||
|
|
a5345bf49d | ||
|
|
e5a621b724 | ||
|
|
38658ec6f3 | ||
|
|
a24ea5414b | ||
|
|
ea228b4491 | ||
|
|
d45269b378 | ||
|
|
ee9841daa9 | ||
|
|
0840abdd24 | ||
|
|
e1f262337b | ||
|
|
fc1d8be3dc | ||
|
|
cd007a53b4 | ||
|
|
66d3d5422c | ||
|
|
bab438ff3e | ||
|
|
882851dc81 | ||
|
|
2f5f9acd55 | ||
|
|
cf348c8d27 | ||
|
|
a5abd1d384 | ||
|
|
e6d4f3c254 | ||
|
|
51906c8c55 | ||
|
|
0838b52e2e | ||
|
|
00d3310d2d | ||
|
|
da3222f371 | ||
|
|
43c5792592 | ||
|
|
3ecabd06ee | ||
|
|
c069086b9c | ||
|
|
11ea5ec1ff | ||
|
|
ecb1952378 | ||
|
|
da8e1a1bf9 | ||
|
|
ee80aee1ca | ||
|
|
0aeb698b77 | ||
|
|
9bb33c8919 | ||
|
|
a67dec7cba | ||
|
|
77740191de | ||
|
|
df01eda4dc | ||
|
|
ba1fcd84a7 | ||
|
|
56539cddac | ||
|
|
430dd4d9eb | ||
|
|
c4c0354eec | ||
|
|
e603129505 | ||
|
|
0b0aa874e8 | ||
|
|
70d5953f82 | ||
|
|
3650a74ed8 | ||
|
|
bb706d6048 | ||
|
|
e30859dff3 | ||
|
|
452a7c9f7c | ||
|
|
d9d342d214 | ||
|
|
53d7f1f601 | ||
|
|
c5ee430328 | ||
|
|
8d6a89dffd | ||
|
|
56531b79cc | ||
|
|
12866af748 | ||
|
|
d8819c88eb | ||
|
|
de75b0bb70 | ||
|
|
7df0289782 | ||
|
|
0abc79482a | ||
|
|
4e57c6587f | ||
|
|
e7d776273d | ||
|
|
c32a18cbe7 | ||
|
|
b07555d26f | ||
|
|
0353d2e162 | ||
|
|
a1f2676879 | ||
|
|
48ddb02b79 | ||
|
|
e502098643 | ||
|
|
dbc3d9991a | ||
|
|
794029f012 | ||
|
|
0231ce836a | ||
|
|
516c3f7847 | ||
|
|
51fc9e017a | ||
|
|
bf0c75cd4f | ||
|
|
c2c661af9b | ||
|
|
798e87db5c | ||
|
|
de6889946b | ||
|
|
7a80b01889 | ||
|
|
e1dd706cd1 | ||
|
|
a685b47c57 | ||
|
|
32c40b95e0 | ||
|
|
db2906108a | ||
|
|
67fc16cd8c | ||
|
|
6330f9477d | ||
|
|
ef1f7030f0 | ||
|
|
12c007e288 | ||
|
|
f242cfcdd5 | ||
|
|
888152bf87 | ||
|
|
fe3a4f5b34 | ||
|
|
98caeadd54 | ||
|
|
64deead719 | ||
|
|
7992324f23 | ||
|
|
40a6f53f6c | ||
|
|
ce58fdc1c3 | ||
|
|
a21256c463 | ||
|
|
316c8492bf | ||
|
|
2d9ee28cab | ||
|
|
81db702ed2 | ||
|
|
92effb07a4 | ||
|
|
87185c88d5 | ||
|
|
9cf4edae6e | ||
|
|
7012d8b45e | ||
|
|
22b42b5402 | ||
|
|
cb7214d8ea | ||
|
|
77e10c9cab | ||
|
|
6f1355a1b7 | ||
|
|
a4ad43ad5a | ||
|
|
a178a0b40b | ||
|
|
b8328b49fb | ||
|
|
5f9679a43b | ||
|
|
699bca76c0 | ||
|
|
c17610e2ba | ||
|
|
71df2a57ef | ||
|
|
4dd42db566 | ||
|
|
84371daf75 | ||
|
|
f32c7d6f54 | ||
|
|
3cfa63ad99 | ||
|
|
4d6afcaddc | ||
|
|
97588c4d12 | ||
|
|
839c6b7b72 | ||
|
|
8f066146c3 | ||
|
|
cec418b5df | ||
|
|
cc313cb73d | ||
|
|
26a465584a | ||
|
|
e924bbb4f4 | ||
|
|
656516c315 | ||
|
|
e48b2e6848 | ||
|
|
7a228b5305 | ||
|
|
f716a15372 | ||
|
|
2601f18a82 | ||
|
|
4de87866a8 | ||
|
|
eca7a8fb59 | ||
|
|
8005e606bf | ||
|
|
68dfe28eae | ||
|
|
ed40d85929 | ||
|
|
0ff70821c9 | ||
|
|
5253f4276f | ||
|
|
30854783ad | ||
|
|
1073ba68b0 | ||
|
|
c309bb5245 | ||
|
|
3e1ad40655 | ||
|
|
62d54ba46d | ||
|
|
b004c00418 | ||
|
|
7f12c82fa6 | ||
|
|
6fb0215eee | ||
|
|
55c21c8836 | ||
|
|
3999442f1c | ||
|
|
71362ffab4 | ||
|
|
20ee418adc | ||
|
|
389aa1b2eb | ||
|
|
3ed767ec06 | ||
|
|
5f96c00c55 | ||
|
|
4587063267 | ||
|
|
472fdee974 | ||
|
|
df78aeef08 | ||
|
|
7df331c66b | ||
|
|
eb5352a770 | ||
|
|
d1cf8214e5 | ||
|
|
730bd35378 | ||
|
|
f55c76c2b3 | ||
|
|
d84d8f4429 | ||
|
|
ae66818379 | ||
|
|
d44a63c6d6 | ||
|
|
066209a045 | ||
|
|
5f7209a793 | ||
|
|
2d4978a57e | ||
|
|
6965a392a4 | ||
|
|
5a4802588e | ||
|
|
8e22da1d7f | ||
|
|
a4fdf2405c | ||
|
|
e6309acdba | ||
|
|
988ee66b0d | ||
|
|
ea38474ac5 | ||
|
|
742e9ff6b3 | ||
|
|
e9056056fb | ||
|
|
1489902b53 | ||
|
|
933f67ecd8 | ||
|
|
fd65015a14 | ||
|
|
77e1c035d0 | ||
|
|
6f403501a0 | ||
|
|
052950e5b3 | ||
|
|
1ef9c9e294 | ||
|
|
5c8f2adf50 | ||
|
|
ed8e6843cc | ||
|
|
d045e22dfe | ||
|
|
1d34eb11e0 | ||
|
|
9a3101b2ba | ||
|
|
d5dbdbfcb2 | ||
|
|
30d6466238 | ||
|
|
e9af6ba62a | ||
|
|
c6fa3895e9 | ||
|
|
3137991f55 | ||
|
|
57430fc95c | ||
|
|
c68c7b403d | ||
|
|
53a1ba6ec5 | ||
|
|
1840c5cb18 | ||
|
|
1bed891f72 | ||
|
|
ceca060501 | ||
|
|
75648b16dd | ||
|
|
460d02a417 | ||
|
|
b4c8fbaae2 | ||
|
|
e99e467384 | ||
|
|
a42ab317ac | ||
|
|
b7f1f490a6 | ||
|
|
30b44a1598 | ||
|
|
1f400c58b8 | ||
|
|
711241c13c | ||
|
|
d7219bcda3 | ||
|
|
4050bae417 | ||
|
|
f1805db1a6 | ||
|
|
434f3d3eb8 | ||
|
|
2092ce8c39 | ||
|
|
fc9f821d20 | ||
|
|
9452863088 | ||
|
|
2b1b3dfa4b | ||
|
|
cca2d2cdbe | ||
|
|
aab0102a26 | ||
|
|
b34129bf8e | ||
|
|
4d7231e774 | ||
|
|
8ac3a41487 | ||
|
|
7d6da483b0 | ||
|
|
e4c3182c68 | ||
|
|
b4734b9550 | ||
|
|
30b9c67743 | ||
|
|
11857a00b0 | ||
|
|
8c25f9cfb6 | ||
|
|
56e96b37e4 | ||
|
|
698024ecce | ||
|
|
0730414999 | ||
|
|
a982f5b5ea | ||
|
|
0e741c12e3 | ||
|
|
56669c1f29 | ||
|
|
3f5f36da3f | ||
|
|
e1eefa4c40 | ||
|
|
ed6ae1e36a | ||
|
|
9875be6431 | ||
|
|
df44df0143 | ||
|
|
87cbbdff63 | ||
|
|
986ab5db63 | ||
|
|
dd39f91edb | ||
|
|
c7a29d2c8d | ||
|
|
8237ab8a2b | ||
|
|
3fd74189db | ||
|
|
5e5a7eb16f | ||
|
|
3d84ef9054 | ||
|
|
4d01b64284 | ||
|
|
114b0e2500 | ||
|
|
647464719b | ||
|
|
e5bfcb6a88 | ||
|
|
22924383e1 | ||
|
|
56f45eddaf | ||
|
|
82b05b15e6 | ||
|
|
a2e9ebe9e2 | ||
|
|
93c8672ceb | ||
|
|
371b1d4c61 | ||
|
|
c9e093116c | ||
|
|
c0c2dd1e0b | ||
|
|
06c20c9904 | ||
|
|
6eb745d9bd | ||
|
|
66483a9d00 | ||
|
|
edfe867208 | ||
|
|
dc45efc8ef | ||
|
|
fb8851f254 | ||
|
|
a903d59ffa | ||
|
|
322cb02872 | ||
|
|
2c52c7fd9a | ||
|
|
1e1c06789e | ||
|
|
7218f83992 | ||
|
|
20e4497be2 | ||
|
|
1c7bcc55b8 | ||
|
|
a9705a290a | ||
|
|
64192d5624 | ||
|
|
fe25772aa9 | ||
|
|
0cca9b4d13 | ||
|
|
a8c536829c | ||
|
|
fcbcba6c70 | ||
|
|
3168285fca | ||
|
|
3fb0d90999 | ||
|
|
05c2dee7e9 | ||
|
|
1d642872a2 | ||
|
|
9ccef8e333 | ||
|
|
537cc635c7 | ||
|
|
5031cd5d55 | ||
|
|
3aaa94ac99 | ||
|
|
8e38e99829 | ||
|
|
0075bfffd4 | ||
|
|
cb0a7b4bea | ||
|
|
8f4f77a727 | ||
|
|
22e44ad589 | ||
|
|
88f5b19f0b | ||
|
|
613abb50d5 | ||
|
|
cdeec2e606 | ||
|
|
1607e664f0 | ||
|
|
68d7231991 | ||
|
|
2fd893b4ce | ||
|
|
02f5903b84 | ||
|
|
ac10fd3c69 | ||
|
|
9d2d561257 | ||
|
|
fe69f331f8 | ||
|
|
3319a493fc | ||
|
|
61728cd1df | ||
|
|
0c80efd94f | ||
|
|
a8b70304d6 | ||
|
|
d44e9df7d4 | ||
|
|
48fc8b1e59 | ||
|
|
1ffe934c8a | ||
|
|
2c8b9182b5 | ||
|
|
4f5299f717 | ||
|
|
09540cd918 | ||
|
|
da2f6800e0 | ||
|
|
ba558c029a | ||
|
|
97cfa99d59 | ||
|
|
bbc6c2f1e5 | ||
|
|
8151609583 | ||
|
|
fdf93486d6 | ||
|
|
d69062c67a | ||
|
|
ae4821a108 | ||
|
|
7ed27f3cb5 | ||
|
|
a4511e38db | ||
|
|
71d0ae1c54 | ||
|
|
3d4e7d34be | ||
|
|
6a25ea5f0e | ||
|
|
73ff872db0 | ||
|
|
468a8d72ba | ||
|
|
4c23690f43 | ||
|
|
814843e021 | ||
|
|
20852c8f4c | ||
|
|
40b6b38f2c | ||
|
|
da94c7c0eb | ||
|
|
1395461f5f | ||
|
|
9912b8ccb8 | ||
|
|
49ef847aa8 | ||
|
|
67745d189f | ||
|
|
2a2d5d2780 | ||
|
|
c3e2978620 | ||
|
|
e4bb2684bc | ||
|
|
c64c0b78de | ||
|
|
0af3d4f0df | ||
|
|
da8dadf68b | ||
|
|
f226a3f0c1 | ||
|
|
c2612371ad | ||
|
|
49a986ecd4 | ||
|
|
f6aa122698 | ||
|
|
184b12fdc6 | ||
|
|
b9489f51e1 | ||
|
|
285eaa4285 |
24
.buildkite/ci_config.yaml
Normal file
24
.buildkite/ci_config.yaml
Normal file
@@ -0,0 +1,24 @@
|
||||
name: vllm_ci
|
||||
job_dirs:
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/image_build"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
- "requirements/common.txt"
|
||||
- "requirements/cuda.txt"
|
||||
- "requirements/build.txt"
|
||||
- "requirements/test.txt"
|
||||
- "setup.py"
|
||||
- "csrc/"
|
||||
- "cmake/"
|
||||
run_all_exclude_patterns:
|
||||
- "docker/Dockerfile."
|
||||
- "csrc/cpu/"
|
||||
- "csrc/rocm/"
|
||||
- "cmake/hipify.py"
|
||||
- "cmake/cpu_extension.cmake"
|
||||
registries: public.ecr.aws/q9t5s3a7
|
||||
repositories:
|
||||
main: "vllm-ci-postmerge-repo"
|
||||
premerge: "vllm-ci-test-repo"
|
||||
@@ -1,46 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
template = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<h1>Links for vLLM</h1/>
|
||||
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
|
||||
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--wheel", help="The wheel path.", required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
filename = os.path.basename(args.wheel)
|
||||
|
||||
with open("index.html", "w") as f:
|
||||
print(f"Generated index.html for {args.wheel}")
|
||||
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
|
||||
if "x86_64" in filename:
|
||||
x86_wheel = filename
|
||||
arm_wheel = filename.replace("x86_64", "aarch64").replace(
|
||||
"manylinux1", "manylinux2014"
|
||||
)
|
||||
elif "aarch64" in filename:
|
||||
x86_wheel = filename.replace("aarch64", "x86_64").replace(
|
||||
"manylinux2014", "manylinux1"
|
||||
)
|
||||
arm_wheel = filename
|
||||
else:
|
||||
raise ValueError(f"Unsupported wheel: {filename}")
|
||||
# cloudfront requires escaping the '+' character
|
||||
f.write(
|
||||
template.format(
|
||||
x86_wheel=x86_wheel,
|
||||
x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
|
||||
arm_wheel=arm_wheel,
|
||||
arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
|
||||
)
|
||||
)
|
||||
56
.buildkite/image_build/image_build.sh
Executable file
56
.buildkite/image_build/image_build.sh
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 8 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
BRANCH=$4
|
||||
VLLM_USE_PRECOMPILED=$5
|
||||
VLLM_MERGE_BASE_COMMIT=$6
|
||||
CACHE_FROM=$7
|
||||
CACHE_TO=$8
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# docker buildx
|
||||
docker buildx create --name vllm-builder --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
docker buildx ls
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
||||
else
|
||||
merge_base_commit_build_args=""
|
||||
fi
|
||||
|
||||
# build
|
||||
docker buildx build --file docker/Dockerfile \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
||||
${merge_base_commit_build_args} \
|
||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
||||
--push \
|
||||
--target test \
|
||||
--progress plain .
|
||||
57
.buildkite/image_build/image_build.yaml
Normal file
57
.buildkite/image_build/image_build.yaml
Normal file
@@ -0,0 +1,57 @@
|
||||
group: Abuild
|
||||
steps:
|
||||
- label: ":docker: Build image"
|
||||
key: image-build
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build CPU image"
|
||||
key: image-build-cpu
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build HPU image"
|
||||
soft_fail: true
|
||||
depends_on: []
|
||||
key: image-build-hpu
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
|
||||
- label: ":docker: Build CPU arm64 image"
|
||||
key: cpu-arm64-image-build
|
||||
depends_on: []
|
||||
optional: true
|
||||
commands:
|
||||
- .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 2
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 2
|
||||
36
.buildkite/image_build/image_build_cpu.sh
Executable file
36
.buildkite/image_build/image_build_cpu.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||
--build-arg VLLM_CPU_AMXBF16=true \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
33
.buildkite/image_build/image_build_cpu_arm64.sh
Executable file
33
.buildkite/image_build/image_build_cpu_arm64.sh
Executable file
@@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
34
.buildkite/image_build/image_build_hpu.sh
Executable file
34
.buildkite/image_build/image_build_hpu.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# build
|
||||
docker build \
|
||||
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
|
||||
--progress plain \
|
||||
https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
|
||||
@@ -8,3 +8,4 @@ tasks:
|
||||
value: 0.80
|
||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||
num_fewshot: 5
|
||||
rtol: 0.05
|
||||
|
||||
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
@@ -0,0 +1 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
|
||||
--tp-size=1
|
||||
"""
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
RTOL = 0.08
|
||||
DEFAULT_RTOL = 0.08
|
||||
|
||||
|
||||
@contextmanager
|
||||
def scoped_env_vars(new_env: dict[str, str]):
|
||||
if not new_env:
|
||||
# Fast path: nothing to do
|
||||
yield
|
||||
return
|
||||
|
||||
old_values = {}
|
||||
new_keys = []
|
||||
|
||||
try:
|
||||
for key, value in new_env.items():
|
||||
if key in os.environ:
|
||||
old_values[key] = os.environ[key]
|
||||
else:
|
||||
new_keys.append(key)
|
||||
os.environ[key] = str(value)
|
||||
yield
|
||||
finally:
|
||||
# Restore / clean up
|
||||
for key, value in old_values.items():
|
||||
os.environ[key] = value
|
||||
for key in new_keys:
|
||||
os.environ.pop(key, None)
|
||||
|
||||
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len},"
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
env_vars = eval_config.get("env_vars", None)
|
||||
with scoped_env_vars(env_vars):
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
rtol = eval_config.get("rtol", DEFAULT_RTOL)
|
||||
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(
|
||||
f"{task['name']} | {metric['name']}: "
|
||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
||||
f"ground_truth={ground_truth:.3f} | "
|
||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||
|
||||
assert success
|
||||
|
||||
@@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
|
||||
|
||||
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
||||
|
||||
#### Default Parameters Field
|
||||
|
||||
We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
|
||||
|
||||
<details>
|
||||
<summary> An example of the default parameters field </summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"server_environment_variables": {
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"block_size": 128,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"num_prompts": 200,
|
||||
"ignore-eos": ""
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama3B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen3_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-14B",
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-14B"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Visualizing the results
|
||||
|
||||
The `convert-results-json-to-markdown.py` script helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||
|
||||
@@ -110,7 +110,8 @@ json2envs() {
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
local timeout_val="1200"
|
||||
timeout "$timeout_val" bash -c '
|
||||
until curl -X POST localhost:8000/v1/completions; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
@@ -316,12 +317,44 @@ run_throughput_tests() {
|
||||
run_serving_tests() {
|
||||
# run serving tests using `vllm bench serve` command
|
||||
# $1: a json file specifying serving test cases
|
||||
#
|
||||
# Supported JSON formats:
|
||||
# 1) Plain format: top-level array
|
||||
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
#
|
||||
# 2) Default parameters field + plain format tests
|
||||
# {
|
||||
# "defaults": { ... },
|
||||
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||
# }
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
jq -c '
|
||||
if type == "array" then
|
||||
# Plain format: test cases array
|
||||
.[]
|
||||
elif (type == "object" and has("tests")) then
|
||||
# merge the default parameters into each test case
|
||||
. as $root
|
||||
| ($root.defaults // {}) as $d
|
||||
| ($root.tests // [])[]
|
||||
# default qps / max_concurrency from defaults if missing
|
||||
| .qps_list = (.qps_list // $d.qps_list)
|
||||
| .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
|
||||
# merge envs / params: test overrides defaults
|
||||
| .server_environment_variables =
|
||||
(($d.server_environment_variables // {}) + (.server_environment_variables // {}))
|
||||
| .server_parameters =
|
||||
(($d.server_parameters // {}) + (.server_parameters // {}))
|
||||
| .client_parameters =
|
||||
(($d.client_parameters // {}) + (.client_parameters // {}))
|
||||
else
|
||||
error("Unsupported serving test file format: must be array or object with .tests")
|
||||
end
|
||||
' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||
@@ -335,20 +368,25 @@ run_serving_tests() {
|
||||
continue
|
||||
fi
|
||||
|
||||
# get client and server arguments
|
||||
# get client and server arguments (after merging the default parameters)
|
||||
server_params=$(echo "$params" | jq -r '.server_parameters')
|
||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||
|
||||
server_args=$(json2args "$server_params")
|
||||
server_envs=$(json2envs "$server_envs")
|
||||
client_args=$(json2args "$client_params")
|
||||
|
||||
# qps_list
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# max_concurrency_list (fallback to num_prompts if missing)
|
||||
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
|
||||
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
|
||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
||||
max_concurrency_list="[$num_prompts]"
|
||||
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
|
||||
max_concurrency_list="[$num_prompts]"
|
||||
fi
|
||||
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
|
||||
echo "Running over max concurrency list $max_concurrency_list"
|
||||
|
||||
@@ -1,610 +0,0 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
}
|
||||
]
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,276 +1,246 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 32
|
||||
}
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 32
|
||||
}
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 32
|
||||
}
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 200
|
||||
}
|
||||
]
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -8,13 +8,28 @@ steps:
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
@@ -25,24 +40,11 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
@@ -52,7 +54,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@@ -65,7 +67,21 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@@ -109,7 +125,6 @@ steps:
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
- build-wheel-cuda-12-8
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
|
||||
@@ -23,8 +23,8 @@ To download the wheel (by version):
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
@@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
|
||||
docker manifest rm vllm/vllm-openai:latest
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
||||
EOF
|
||||
|
||||
400
.buildkite/scripts/generate-nightly-index.py
Normal file
400
.buildkite/scripts/generate-nightly-index.py
Normal file
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# do not complain about line length (for docstring)
|
||||
# ruff: noqa: E501
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
# Abort early on interpreters older than 3.12. Presumably this is because the
# index generation below calls Path.relative_to(..., walk_up=True), a keyword
# that older pathlib versions do not accept -- confirm before relaxing.
if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
|
||||
<html>
|
||||
<!-- {comment} -->
|
||||
<meta name="pypi:repository-version" content="1.0">
|
||||
<body>
|
||||
{items}
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
class WheelFileInfo:
    """Metadata parsed out of a single wheel file name.

    Instances are produced by `parse_from_filename` and are serialized with
    `dataclasses.asdict` when the per-package metadata JSON is written.
    """

    # Distribution name, i.e. everything before the version segment.
    package_name: str
    # Version string with the variant suffix (if any) already removed.
    version: str
    # Optional build tag segment; None when the file name has none.
    build_tag: str | None
    # Python tag segment, e.g. "cp38".
    python_tag: str
    # ABI tag segment, e.g. "abi3".
    abi_tag: str
    # Platform tag segment, e.g. "manylinux1_x86_64".
    platform_tag: str
    # Variant extracted from the version (e.g. "cu129"); None when absent.
    variant: str | None
    # The original, unmodified wheel file name that was parsed.
    filename: str
|
||||
|
||||
|
||||
def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Split a wheel file name into its components and return them as a WheelFileInfo.

    Expected layout of a wheel name:
        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl

    The version segment may additionally carry a variant, which is separated out:
      * for dev versions, the variant is whatever follows the last '.' after "dev"
        (e.g. "...dev14+gaa384b3c0.cu130" -> variant "cu130");
      * for all other versions, the variant is the part after '+'
        (e.g. "0.10.2rc2+cu129" -> variant "cu129").
    When no variant is present, `variant` is None and the version is left untouched.

    Examples of accepted names:
        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl

    Raises:
        ValueError: if `file` does not match the wheel naming layout above.
    """
    parsed = re.compile(
        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
    ).match(file)
    if parsed is None:
        raise ValueError(f"Invalid wheel file name: {file}")

    # Unmatched optional groups (build_tag) come back as None.
    parts = parsed.groupdict()

    # Peel the variant, if any, off the raw version string.
    version = parts["version"]
    variant = None
    if "dev" not in version:
        if "+" in version:
            version, variant = version.split("+")
    else:
        # Only look at the text following the last "dev" marker.
        dev_tail = version.rpartition("dev")[2]
        if "." in dev_tail:
            variant = dev_tail.rsplit(".", 1)[-1]
            version = version.removesuffix("." + variant)

    return WheelFileInfo(
        package_name=parts["package_name"],
        version=version,
        build_tag=parts["build_tag"],
        python_tag=parts["python_tag"],
        abi_tag=parts["abi_tag"],
        platform_tag=parts["platform_tag"],
        variant=variant,
        filename=file,
    )
|
||||
|
||||
|
||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
    Render the top-level project list page.

    Each entry of `subdir_names` (processed in sorted order) becomes one link
    pointing at the sub-directory of that project or variant. Surrounding '/'
    characters, then surrounding '.' characters, are stripped from each name
    before it is used. `comment` is embedded in the page's HTML comment.
    """
    cleaned_names = (raw.strip("/").strip(".") for raw in sorted(subdir_names))
    items = "\n".join(
        f' <a href="{name}/">{name}/</a><br/>' for name in cleaned_names
    )
    return INDEX_HTML_TEMPLATE.format(items=items, comment=comment)
|
||||
|
||||
|
||||
def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Build the index page and the JSON metadata for one package.

    Wheels are listed in order of their file name. Every link is the path of
    the wheel inside `wheel_base_dir`, expressed relative to `index_base_dir`.

    Returns:
        A tuple `(index_html, metadata_json)`. The JSON is a list with one
        object per wheel: all WheelFileInfo fields plus a "path" key holding
        the same quoted relative path used in the HTML link.
    """
    links = []
    records = []
    for file in sorted(wheel_files, key=lambda info: info.filename):
        target = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
        ).as_posix()
        # Percent-encode the path (notably '+') while leaving ':', '%' and '/'
        # alone, so separators and already-encoded sequences such as '%2B' are
        # not encoded a second time.
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(target, safe=":%/")
        links.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
        records.append({**asdict(file), "path": file_path_quoted})

    return (
        INDEX_HTML_TEMPLATE.format(items="\n".join(links), comment=comment),
        json.dumps(records, indent=2),
    )
|
||||
|
||||
|
||||
def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
):
    """
    Generate index pages and metadata for all wheel files.

    Args:
        whl_files (list[str]): Wheel file names (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str): Optional comment to include in the generated HTML files.

    Every wheel file name is parsed first, then the wheels are grouped by
    variant. Each variant gets its own sub-directory with an index; wheels that
    carry no variant suffix form the "default" group, whose index is written
    directly into the root index directory.

    If `default_variant` is provided, no wheel may be without a variant suffix
    and the named variant must be present; otherwise a ValueError is raised.
    NOTE(review): this function only performs that validation for
    `default_variant` -- it does not itself write a root-level copy of that
    variant's index. Confirm whether that is handled elsewhere.

    If `alias_to_default` is provided and a "default" group exists, an extra
    sub-directory with that name is generated from the same wheels as the
    default group (links adjusted to the new location). A ValueError is raised
    when the alias collides with an existing variant name.

    Index directory structure:
        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html         # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html     # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json  # machine-readable metadata for all wheels in this package
            cpu/               # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/             # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/             # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
            },
            ...
        ]
    """
    wheels = [parse_from_filename(name) for name in whl_files]
    if len(wheels) == 0:
        print("No wheel files found, skipping index generation.")
        return

    # Bucket the wheels by variant; wheels without a variant go under "default".
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for wheel in wheels:
        variant_to_files.setdefault(wheel.variant or "default", []).append(wheel)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # Sanity checks that only apply when an explicit default variant is requested.
    if default_variant and "default" in variant_to_files:
        raise ValueError(
            "All wheel files must have variant suffixes when `default_variant` is specified."
        )
    if default_variant and default_variant not in variant_to_files:
        raise ValueError(
            f"Default variant '{default_variant}' not found among wheel files."
        )

    if alias_to_default:
        default_group = variant_to_files.get("default")
        if default_group is None:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            # Shallow copy: the alias lists the very same wheel entries.
            variant_to_files[alias_to_default] = list(default_group)
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Comment placed in the HTML header of the project-list pages.
    comment_suffix = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_suffix}"

    # Names linked from the top-level project list (variant dirs + default packages).
    top_level_entries = set()
    for variant, files in variant_to_files.items():
        is_default = variant == "default"
        variant_dir = index_base_dir if is_default else index_base_dir / variant
        if not is_default:
            top_level_entries.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # All package names that occur in this variant.
        packages = {wheel.package_name for wheel in files}
        if is_default:
            # Default packages are listed in the top-level project list, which
            # is written once all variants have been processed.
            top_level_entries |= packages
        else:
            # A non-default variant gets its own project list right away.
            variant_list_html = generate_project_list(sorted(packages), comment_tmpl)
            with open(variant_dir / "index.html", "w") as out:
                out.write(variant_list_html)

        for package in packages:
            # Only the wheels that belong to this package.
            own_wheels = [wheel for wheel in files if wheel.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            # NOTE(review): the raw `comment` is passed here, whereas the
            # project lists receive `comment_tmpl` (with the "Generated on"
            # timestamp). Confirm whether this difference is intended.
            index_html, metadata_json = generate_package_index_and_metadata(
                own_wheels, wheel_base_dir, package_dir, comment
            )
            with open(package_dir / "index.html", "w") as out:
                out.write(index_html)
            with open(package_dir / "metadata.json", "w") as out:
                out.write(metadata_json)

    # Top-level project list index.
    root_list_html = generate_project_list(sorted(top_level_entries), comment_tmpl)
    with open(index_base_dir / "index.html", "w") as out:
        out.write(root_list_html)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Arguments:
    #   --version <version>                     : version string for the current build (e.g., commit hash)
    #   --current-objects <path_to_json>        : path to JSON file containing current S3 objects listing in this version directory
    #   --output-dir <output_directory>         : directory to store generated index files
    #   --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
    #   --comment <comment_string>              : (optional) comment string to include in generated HTML files

    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    # The three mandatory options only differ in flag name and help text.
    required_options = (
        ("--version", "Version string for the current build (e.g., commit hash)"),
        (
            "--current-objects",
            "Path to JSON file containing current S3 objects listing in this version directory",
        ),
        ("--output-dir", "Directory to store generated index files"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )
    parser.add_argument(
        "--comment",
        type=str,
        default="",
        help="Optional comment string to include in generated HTML files",
    )

    args = parser.parse_args()

    version = args.version
    # The version is used as a single path component below, so reject separators.
    if any(sep in version for sep in ("/", "\\")):
        raise ValueError("Version string must not contain slashes.")
    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    # Read current objects JSON
    with open(current_objects_path) as listing_file:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(listing_file)

    # current_objects looks like the output of the list_objects_v2 S3 API:
    #
    #   "Contents": [
    #       {
    #           "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
    #           "LastModified": "2025-11-28T14:00:32+00:00",
    #           "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
    #           "ChecksumAlgorithm": [
    #               "CRC64NVME"
    #           ],
    #           "ChecksumType": "FULL_OBJECT",
    #           "Size": 435649349,
    #           "StorageClass": "STANDARD"
    #       },
    #       ...
    #   ]

    # Extract wheel file keys (only the filename part of each key is used)
    wheel_files = [
        entry["Key"].split("/")[-1]
        for entry in current_objects.get("Contents", [])
        if entry["Key"].endswith(".whl")
    ]

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # keep only "official" files for a non-nightly version (specified by cli args)
    PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
    if PY_VERSION_RE.match(version):
        # upload-wheels.sh ensures no "dev" is in args.version
        wheel_files = [
            name for name in wheel_files if version in name and "dev" not in name
        ]
        print(f"Non-nightly version detected, wheel files used: {wheel_files}")
    else:
        print("Nightly version detected, keeping all wheel files.")

    # Generate index and metadata, assuming wheels and indices are stored as:
    #   s3://vllm-wheels/{version}/<wheel files>
    #   s3://vllm-wheels/<anything>/<index files>
    index_base_dir = output_dir
    wheel_base_dir = output_dir.parent / version

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
        comment=args.comment.strip(),
    )
    print(f"Successfully generated index and metadata in {output_dir}")
|
||||
68
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
Executable file
68
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script builds the CPU docker image and runs the offline inference inside the container.
|
||||
# It serves as a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-0-16}
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=16
|
||||
|
||||
# Setup cleanup
|
||||
# Force-remove the `cpu-test` container; a failing `docker rm` is ignored.
remove_docker_container() {
  set -e
  docker rm -f cpu-test || true
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
|
||||
|
||||
# Run the CPU test stages inside the `cpu-test` container.
# Each stage is its own `docker exec`; `set -e` is enabled both here and at the
# top of every inner shell script.
function cpu_tests() {
  set -e

  # List the installed Python packages.
  docker exec cpu-test bash -c "
set -e
pip list"

  # offline inference
  docker exec cpu-test bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"

  # Run kernel tests
  docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"

  # basic online serving
  # The inner script is single-quoted, so $E2E_OMP_THREADS and $server_pid are
  # expanded by the shell inside the container, not by this script.
  # The server is started in the background, polled for up to 600 seconds,
  # benchmarked, and then sent SIGTERM.
  docker exec cpu-test bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
|
||||
|
||||
# All of the CPU tests are expected to finish in less than 40 mins.
|
||||
export -f cpu_tests
|
||||
timeout 2h bash -c cpu_tests
|
||||
@@ -25,20 +25,22 @@ function cpu_tests() {
|
||||
|
||||
# offline inference
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -xve
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -evx
|
||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||
pip install sentence-transformers datamodel_code_generator
|
||||
pip install sentence-transformers datamodel_code_generator tblib
|
||||
|
||||
# Note: disable Bart until supports V1
|
||||
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
||||
|
||||
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
@@ -73,12 +73,11 @@ function cpu_tests() {
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
||||
|
||||
# Note: disable it until supports V1
|
||||
# Run AWQ test
|
||||
# docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
# set -e
|
||||
# pytest -x -s -v \
|
||||
# tests/quantization/test_ipex_quant.py"
|
||||
# Run AWQ/GPTQ test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_cpu_wna16.py"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
|
||||
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV SOC_VERSION="ascend910b1"
|
||||
|
||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
||||
|
||||
@@ -35,9 +35,10 @@ docker run \
|
||||
echo $ZE_AFFINITY_MASK
|
||||
pip install tblib==3.1.0
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
cd tests
|
||||
pytest -v -s v1/core
|
||||
@@ -46,6 +47,6 @@ docker run \
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
||||
|
||||
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
|
||||
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Setting up Prime-RL integration test environment..."
|
||||
|
||||
# Clean up any existing Prime-RL directory
|
||||
|
||||
@@ -17,7 +17,17 @@ wait_for_server() {
|
||||
}
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-lite"
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
|
||||
THRESHOLD=${1:-0.8}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8020}
|
||||
DATA_PARALLEL_SIZE=${4:-2}
|
||||
TENSOR_PARALLEL_SIZE=${5:-2}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
# Block until the server's /health endpoint on 127.0.0.1:<port> answers.
# $1: port to poll. Polls once per second; `timeout` ends the wait after 600 seconds.
wait_for_server() {
  local port=$1
  # The single-quoted script is closed and reopened around "$port" so that the
  # port value is substituted by this shell before the inner bash runs.
  timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
|
||||
|
||||
MODEL="QWen/Qwen3-30B-A3B-FP8"
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
# Stop the server recorded in SERVER_PID: ask it to terminate, poll for its
# exit up to 20 times at 0.5 second intervals, then send SIGKILL.
# Does nothing when SERVER_PID is unset/empty or the process is already gone.
cleanup() {
  [[ -n "${SERVER_PID:-}" ]] || return 0
  kill -0 "${SERVER_PID}" 2>/dev/null || return 0

  kill "${SERVER_PID}" 2>/dev/null || true
  local attempt
  for attempt in {1..20}; do
    if ! kill -0 "${SERVER_PID}" 2>/dev/null; then
      break
    fi
    sleep 0.5
  done
  # Final SIGKILL; errors (e.g. the process already exited) are ignored.
  kill -9 "${SERVER_PID}" 2>/dev/null || true
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# For every backend in BACKENDS: start a server in the background, wait until
# it is healthy, run the GSM8K eval against it, check the accuracy against
# THRESHOLD, stop the server, and move on to the next port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  # Derive a file-name tag from the model name.
  # NOTE(review): inside single quotes `\\n` reaches tr as an escaped backslash
  # followed by the letter n, so "n" is replaced by "_" as well -- confirm intent.
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  # Read the saved results and fail (via assert) when accuracy is below THRESHOLD.
  # The heredoc delimiter is unquoted, so ${OUT}, ${MODEL}, ${BACK} and
  # ${THRESHOLD} are substituted by bash before Python sees the code.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  # Stop this backend's server and clear the PID so the EXIT trap does not act on it again.
  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
|
||||
@@ -2,9 +2,9 @@
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.8}
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8020}
|
||||
PORT=${3:-8040}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
@@ -16,8 +16,18 @@ wait_for_server() {
|
||||
done'
|
||||
}
|
||||
|
||||
MODEL="QWen/Qwen3-30B-A3B-FP8"
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
@@ -36,11 +46,14 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
--data-parallel-size 2 \
|
||||
--tensor-parallel-size 4 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
@@ -2,6 +2,28 @@
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== part 0: setup ========
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
INDICES_OUTPUT_DIR="indices"
|
||||
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
|
||||
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
# detect if python3.12+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# use new python from docker
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ========= part 1: collect, rename & upload the wheel ==========
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
@@ -10,74 +32,76 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get the single wheel file
|
||||
wheel="${wheel_files[0]}"
|
||||
|
||||
# Detect architecture and rename 'linux' to appropriate manylinux version
|
||||
arch=$(uname -m)
|
||||
if [[ $arch == "x86_64" ]]; then
|
||||
manylinux_version="manylinux1"
|
||||
elif [[ $arch == "aarch64" ]]; then
|
||||
manylinux_version="manylinux2014"
|
||||
else
|
||||
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
|
||||
manylinux_version="manylinux1"
|
||||
fi
|
||||
# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31
|
||||
# we also accept params as manylinux tag
|
||||
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
|
||||
manylinux_version="${1:-manylinux_2_31}"
|
||||
|
||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
|
||||
if [[ "$wheel" != *"linux"* ]]; then
|
||||
echo "Error: Wheel filename does not contain 'linux': $wheel"
|
||||
exit 1
|
||||
fi
|
||||
new_wheel="${wheel/linux/$manylinux_version}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
echo "Renamed wheel to: $wheel"
|
||||
|
||||
# Extract the version from the wheel
|
||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version: $version"
|
||||
echo "Version in wheel: $version"
|
||||
pure_version="${version%%+*}"
|
||||
echo "Pure version (without variant): $pure_version"
|
||||
|
||||
normal_wheel="$wheel" # Save the original wheel filename
|
||||
# copy wheel to its own bucket
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
||||
if [[ $version == *dev* ]]; then
|
||||
suffix="${version##*.}"
|
||||
if [[ $suffix == cu* ]]; then
|
||||
new_version="1.0.0.dev+${suffix}"
|
||||
else
|
||||
new_version="1.0.0.dev"
|
||||
fi
|
||||
new_wheel="${wheel/$version/$new_version}"
|
||||
# use cp to keep both files in the artifacts directory
|
||||
cp -- "$wheel" "$new_wheel"
|
||||
wheel="$new_wheel"
|
||||
version="$new_version"
|
||||
fi
|
||||
# ========= part 2: generate and upload indices ==========
|
||||
# generate indices for all existing wheels in the commit directory
|
||||
# this script might be run multiple times if there are multiple variants being built
|
||||
# so we need to guarantee there is little chance for "TOCTOU" issues
|
||||
# i.e., one process is generating indices while another is uploading a new wheel
|
||||
# so we need to ensure no time-consuming operations happen below
|
||||
|
||||
# Upload the wheel to S3
|
||||
python3 .buildkite/generate_index.py --wheel "$normal_wheel"
|
||||
# list all wheels in the commit directory
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
obj_json="objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# generate index for this commit
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
|
||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
||||
# only upload index.html for cu129 wheels (default wheels) as it
|
||||
# is available on both x86 and arm64
|
||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||
# call script to generate indices for all existing wheels
|
||||
# these indices have relative paths that work as long as they are next to the wheel directory in s3
|
||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
|
||||
else
|
||||
echo "Skipping index files for non-cu129 wheels"
|
||||
alias_arg=""
|
||||
fi
|
||||
|
||||
# generate index for nightly
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
||||
|
||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
||||
# only upload index.html for cu129 wheels (default wheels) as it
|
||||
# is available on both x86 and arm64
|
||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||
else
|
||||
echo "Skipping index files for non-cu129 wheels"
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# copy to /nightly/ only if it is on the main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||
echo "Uploading indices to overwrite /nightly/"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
|
||||
fi
|
||||
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
|
||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||
# For a release (non-"dev") version, rebuild the indices with the pure version
# string and upload them to /<pure_version>/ as well.
if [[ "$version" != *"dev"* ]]; then
    echo "Re-generating indices for /$pure_version/"
    # Clear the indices generated above for the commit directory.
    # The glob must sit OUTSIDE the quotes: a quoted "*" is passed to rm literally
    # and nothing gets removed. `:?` aborts if the variable is ever empty, so this
    # can never expand to `rm -rf /*`.
    rm -rf "${INDICES_OUTPUT_DIR:?}"/*
    mkdir -p "$INDICES_OUTPUT_DIR"
    # $alias_arg is intentionally unquoted so that it splits into flag + value
    # (or disappears entirely when empty).
    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi
|
||||
|
||||
@@ -39,9 +39,9 @@ steps:
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
# of the dependencies. Please check the error message and add the package to whitelist
|
||||
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
grade: Blocking
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- requirements/nightly_torch_test.txt
|
||||
@@ -50,9 +50,9 @@ steps:
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 10min
|
||||
timeout_in_minutes: 15
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/multimodal
|
||||
@@ -61,25 +61,31 @@ steps:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
|
||||
timeout_in_minutes: 10
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
- label: Python-only Installation Test # 10min
|
||||
timeout_in_minutes: 20
|
||||
@@ -111,9 +117,9 @@ steps:
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
|
||||
- label: Entrypoints Unit Tests # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
grade: Blocking
|
||||
timeout_in_minutes: 10
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
@@ -187,7 +193,7 @@ steps:
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/test_basic_correctness
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
@@ -210,12 +216,13 @@ steps:
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
@@ -248,9 +255,9 @@ steps:
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: EPLB Algorithm Test # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
grade: Blocking
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
@@ -306,28 +313,25 @@ steps:
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 25min
|
||||
timeout_in_minutes: 40
|
||||
- label: Engine Test # 9min
|
||||
timeout_in_minutes: 15
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
- tests/tokenization
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: V1 Test e2e + engine # 30min
|
||||
timeout_in_minutes: 45
|
||||
- label: V1 Test e2e + engine # 65min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -340,9 +344,9 @@ steps:
|
||||
|
||||
- label: V1 Test entrypoints # 35min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
@@ -390,10 +394,34 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
- label: Batch Invariance Tests (H100) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
|
||||
- label: V1 Test attention (B200) # 10min
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
@@ -409,29 +437,34 @@ steps:
|
||||
|
||||
- label: Examples Test # 30min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
@@ -485,7 +518,7 @@ steps:
|
||||
|
||||
- label: PyTorch Compilation Unit Tests # 15min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
@@ -493,21 +526,16 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_fusion_attn.py
|
||||
- pytest -v -s compile/test_functionalization.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
# - pytest -v -s compile/test_sequence_parallelism.py
|
||||
# - pytest -v -s compile/test_async_tp.py
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
- pytest -v -s compile/test_noop_elimination.py
|
||||
- pytest -v -s compile/test_aot_compile.py
|
||||
# Run unit tests defined directly under compile/,
|
||||
# not including subdirectories, which are usually heavier
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
@@ -515,9 +543,11 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/test_multimodal_compile.py
|
||||
- pytest -v -s compile/piecewise/
|
||||
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
||||
|
||||
- label: PyTorch Fullgraph Test # 27min
|
||||
timeout_in_minutes: 40
|
||||
@@ -529,10 +559,10 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@@ -561,7 +591,7 @@ steps:
|
||||
|
||||
- label: Kernels Attention Test %N # 23min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
@@ -588,7 +618,7 @@ steps:
|
||||
|
||||
- label: Kernels MoE Test %N # 40min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
@@ -615,6 +645,26 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
|
||||
# Not replicating for CUTLAS & CuTe
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization/test_block_fp8.py
|
||||
- tests/kernels/moe/test_deepgemm.py
|
||||
- tests/kernels/moe/test_batched_deepgemm.py
|
||||
- tests/kernels/attention/test_deepgemm_attention.py
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -673,16 +723,18 @@ steps:
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
@@ -695,9 +747,9 @@ steps:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval
|
||||
commands: # LMEval+Transcription WER check
|
||||
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
|
||||
- pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 23 min
|
||||
timeout_in_minutes: 35
|
||||
@@ -709,19 +761,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' tool_use
|
||||
|
||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s -m 'cpu_test' tool_use
|
||||
- pytest -v -s tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
@@ -746,6 +786,7 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- vllm/transformers_utils/
|
||||
- tests/models/test_initialization.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
@@ -891,6 +932,18 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling_mteb_test
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor Test # 44min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -918,8 +971,8 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||
timeout_in_minutes: 70
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
|
||||
timeout_in_minutes: 180
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@@ -931,7 +984,8 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@@ -955,7 +1009,8 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
- label: Multi-Modal Models Test (Extended) 3 # 75min
|
||||
timeout_in_minutes: 150
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
@@ -998,12 +1053,12 @@ steps:
|
||||
optional: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
@@ -1047,8 +1102,9 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
|
||||
- label: Blackwell Fusion Tests # 30 min
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@@ -1056,20 +1112,29 @@ steps:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -1086,20 +1151,16 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusions_e2e.py
|
||||
- tests/compile/test_full_graph.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdproduction]
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@@ -1108,7 +1169,7 @@ steps:
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
@@ -1198,7 +1259,7 @@ steps:
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/test_basic_correctness.py
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
@@ -1208,10 +1269,11 @@ steps:
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
@@ -1243,7 +1305,7 @@ steps:
|
||||
|
||||
- label: Plugin Tests (2 GPUs) # 40min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -1311,12 +1373,15 @@ steps:
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# doesn't support LoRA yet
|
||||
#- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -1326,7 +1391,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -1334,13 +1399,12 @@ steps:
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -1376,7 +1440,83 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
- label: Distributed Tests (B200) # optional
|
||||
gpu: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### E2E Eval Tests #####
|
||||
- label: LM Eval Small Models (1 Card) # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: LM Eval Large Models (4 Card)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
@@ -1391,50 +1531,29 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
- label: ROCm LM Eval Large Models (8 Card)
|
||||
mirror_hardwares: [amdproduction]
|
||||
agent_pool: mi325_8
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/compile/test_async_tp.py
|
||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
- label: Distributed Tests (B200) # optional
|
||||
gpu: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
@@ -1450,9 +1569,8 @@ steps:
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 60
|
||||
@@ -1463,8 +1581,8 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
mirror_hardwares: [amdexperimental]
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 60
|
||||
@@ -1473,4 +1591,36 @@ steps:
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
@@ -57,14 +57,16 @@ steps:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
|
||||
timeout_in_minutes: 10
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
@@ -73,6 +75,8 @@ steps:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
@@ -167,7 +171,7 @@ steps:
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/test_basic_correctness
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
@@ -192,12 +196,13 @@ steps:
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
@@ -275,21 +280,18 @@ steps:
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 25min
|
||||
timeout_in_minutes: 40
|
||||
- label: Engine Test # 9min
|
||||
timeout_in_minutes: 15
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
- tests/tokenization
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: V1 Test e2e + engine # 30min
|
||||
timeout_in_minutes: 45
|
||||
@@ -346,6 +348,19 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: Batch Invariance Tests (H100) # 10min
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
|
||||
- label: V1 Test attention (B200) # 10min
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
@@ -375,23 +390,28 @@ steps:
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
@@ -445,18 +465,14 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_graph_partition.py
|
||||
- pytest -v -s compile/test_config.py
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_fusion_attn.py
|
||||
- pytest -v -s compile/test_functionalization.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
- pytest -v -s compile/test_noop_elimination.py
|
||||
- pytest -v -s compile/test_aot_compile.py
|
||||
- pytest -v -s compile/test_qk_norm_rope_fusion.py
|
||||
# Run unit tests defined directly under compile/,
|
||||
# not including subdirectories, which are usually heavier
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@@ -466,9 +482,13 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s compile/test_multimodal_compile.py
|
||||
- pytest -v -s compile/piecewise/
|
||||
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph Test # 27min
|
||||
timeout_in_minutes: 40
|
||||
@@ -479,10 +499,10 @@ steps:
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@@ -554,6 +574,25 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization/test_block_fp8.py
|
||||
- tests/kernels/moe/test_deepgemm.py
|
||||
- tests/kernels/moe/test_batched_deepgemm.py
|
||||
- tests/kernels/attention/test_deepgemm_attention.py
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -604,6 +643,7 @@ steps:
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@@ -634,16 +674,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' tool_use
|
||||
|
||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s -m 'cpu_test' tool_use
|
||||
- pytest -v -s tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
@@ -654,6 +685,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Run a subset of model initialization tests
|
||||
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||
@@ -664,7 +696,9 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- vllm/transformers_utils/
|
||||
- tests/models/test_initialization.py
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
# subset of supported models (the complement of the small subset in the above
|
||||
@@ -790,14 +824,24 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling_mteb_test
|
||||
|
||||
- label: Multi-Modal Processor Test # 44min
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'"
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor Test
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing
|
||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Models Test (Standard) # 60min
|
||||
timeout_in_minutes: 80
|
||||
@@ -874,14 +918,15 @@ steps:
|
||||
- label: Transformers Nightly Models Test
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
soft_fail: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
@@ -925,6 +970,7 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -934,22 +980,29 @@ steps:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -966,12 +1019,11 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusions_e2e.py
|
||||
- tests/compile/test_full_graph.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
@@ -1069,7 +1121,7 @@ steps:
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/test_basic_correctness.py
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
@@ -1081,10 +1133,11 @@ steps:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
@@ -1170,6 +1223,8 @@ steps:
|
||||
# FIXIT: find out which code initialize cuda before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# Alot of these tests are on the edge of OOMing
|
||||
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
@@ -1264,11 +1319,11 @@ steps:
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/compile/test_async_tp.py
|
||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
@@ -1288,6 +1343,7 @@ steps:
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
@@ -1305,11 +1361,20 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
21
.buildkite/test_areas/attention.yaml
Normal file
21
.buildkite/test_areas/attention.yaml
Normal file
@@ -0,0 +1,21 @@
|
||||
group: Attention
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: V1 attention (H100)
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 attention (B200)
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
16
.buildkite/test_areas/basic_correctness.yaml
Normal file
16
.buildkite/test_areas/basic_correctness.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
group: Basic Correctness
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Basic Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_cumem.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
19
.buildkite/test_areas/benchmarks.yaml
Normal file
19
.buildkite/test_areas/benchmarks.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
group: Benchmarks
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Benchmarks
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Benchmarks CLI Test
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
57
.buildkite/test_areas/compile.yaml
Normal file
57
.buildkite/test_areas/compile.yaml
Normal file
@@ -0,0 +1,57 @@
|
||||
group: Compile
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Fusion and Compile Tests (B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Fusion E2E (2 GPUs)(B200)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
22
.buildkite/test_areas/cuda.yaml
Normal file
22
.buildkite/test_areas/cuda.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
group: CUDA
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Platform Tests (CUDA)
|
||||
timeout_in_minutes: 15
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
|
||||
- label: Cudagraph
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- tests/v1/cudagraph
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/config/compilation.py
|
||||
- vllm/compilation
|
||||
commands:
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
|
||||
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
|
||||
199
.buildkite/test_areas/distributed.yaml
Normal file
199
.buildkite/test_areas/distributed.yaml
Normal file
@@ -0,0 +1,199 @@
|
||||
group: Distributed
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Distributed Comm Ops
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
commands:
|
||||
- pytest -v -s distributed/test_comm_ops.py
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 90
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=4 and dp=1
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2, pp=2 and dp=1
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=1 and dp=4 with ep
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2 and dp=2 with ep
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
- vllm/config/parallel.py
|
||||
- vllm/distributed/
|
||||
- vllm/v1/engine/llm_engine.py
|
||||
- vllm/v1/executor/uniproc_executor.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and dp=4 with ep
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: Distributed Tests (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H200)
|
||||
gpu: h200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
gpu: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: 2 Node Test (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
59
.buildkite/test_areas/e2e_integration.yaml
Normal file
59
.buildkite/test_areas/e2e_integration.yaml
Normal file
@@ -0,0 +1,59 @@
|
||||
group: E2E Integration
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: Prime-RL Integration (2 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
26
.buildkite/test_areas/engine.yaml
Normal file
26
.buildkite/test_areas/engine.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
group: Engine
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Engine
|
||||
timeout_in_minutes: 15
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 e2e + engine
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# TODO: accuracy does not match on H100, regardless of whether
|
||||
# VLLM_USE_FLASHINFER_SAMPLER is set.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
68
.buildkite/test_areas/entrypoints.yaml
Normal file
68
.buildkite/test_areas/entrypoints.yaml
Normal file
@@ -0,0 +1,68 @@
|
||||
group: Entrypoints
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Entrypoints Unit Tests
|
||||
timeout_in_minutes: 10
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration (LLM)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration (API Server)
|
||||
timeout_in_minutes: 130
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/openai
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
|
||||
- label: Entrypoints Integration (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/pooling
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/pooling
|
||||
|
||||
|
||||
- label: Entrypoints V1
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
|
||||
- label: OpenAI API Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/model_executor/models/whisper.py
|
||||
commands: # LMEval+Transcription WER check
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
23
.buildkite/test_areas/expert_parallelism.yaml
Normal file
23
.buildkite/test_areas/expert_parallelism.yaml
Normal file
@@ -0,0 +1,23 @@
|
||||
group: Expert Parallelism
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: EPLB Algorithm
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_algo.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
|
||||
- label: EPLB Execution
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
117
.buildkite/test_areas/kernels.yaml
Normal file
117
.buildkite/test_areas/kernels.yaml
Normal file
@@ -0,0 +1,117 @@
|
||||
group: Kernels
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Kernels Core Operation Test
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
commands:
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Quantization Test %N
|
||||
timeout_in_minutes: 90
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test %N
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- csrc/moe/
|
||||
- tests/kernels/moe
|
||||
- vllm/model_executor/layers/fused_moe/
|
||||
- vllm/distributed/device_communicators/
|
||||
- vllm/envs.py
|
||||
- vllm/config
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
- tests/kernels/mamba
|
||||
- vllm/model_executor/layers/mamba/ops
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization/test_block_fp8.py
|
||||
- tests/kernels/moe/test_deepgemm.py
|
||||
- tests/kernels/moe/test_batched_deepgemm.py
|
||||
- tests/kernels/attention/test_deepgemm_attention.py
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- csrc/attention/mla/
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||
- vllm/platforms/cuda.py
|
||||
- vllm/attention/selector.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
46
.buildkite/test_areas/lm_eval.yaml
Normal file
46
.buildkite/test_areas/lm_eval.yaml
Normal file
@@ -0,0 +1,46 @@
|
||||
group: LM Eval
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: LM Eval Small Models
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
autorun_on_main: true
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Small Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
31
.buildkite/test_areas/lora.yaml
Normal file
31
.buildkite/test_areas/lora.yaml
Normal file
@@ -0,0 +1,31 @@
|
||||
group: LoRA
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: LoRA %N
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
|
||||
parallelism: 4
|
||||
|
||||
|
||||
- label: LoRA TP (Distributed)
|
||||
timeout_in_minutes: 30
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
# FIXIT: find out which code initializes CUDA before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# There is some Tensor Parallelism related processing logic in LoRA that
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
165
.buildkite/test_areas/misc.yaml
Normal file
165
.buildkite/test_areas/misc.yaml
Normal file
@@ -0,0 +1,165 @@
|
||||
group: Miscellaneous
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: V1 Others
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'not cpu_test' v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
- pytest -v -s v1/test_outputs.py
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: V1 Others (CPU)
|
||||
depends_on: ~
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
no_gpu: true
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'cpu_test' v1/metrics
|
||||
|
||||
- label: Regression
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
commands:
|
||||
- pip install modelscope
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Examples
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- vllm/multimodal
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
- python3 offline_inference/basic/chat.py # for basic
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/tracing
|
||||
commands:
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0' \
|
||||
'opentelemetry-api>=1.26.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
||||
- pytest -v -s v1/tracing
|
||||
|
||||
- label: Python-only Installation
|
||||
depends_on: ~
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- tests/standalone_tests/python_only_compile.sh
|
||||
- setup.py
|
||||
commands:
|
||||
- bash standalone_tests/python_only_compile.sh
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||
depends_on: ~
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s config
|
||||
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
17
.buildkite/test_areas/model_executor.yaml
Normal file
17
.buildkite/test_areas/model_executor.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
group: Model Executor
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Model Executor
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- vllm/engine/arg_utils.py
|
||||
- vllm/config/model.py
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
62
.buildkite/test_areas/models_basic.yaml
Normal file
62
.buildkite/test_areas/models_basic.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
group: Models - Basic
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Basic Models Tests (Initialization)
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_initialization.py
|
||||
commands:
|
||||
# Run a subset of model initialization tests
|
||||
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
||||
|
||||
- label: Basic Models Tests (Extra Initialization) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- tests/models/test_initialization.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
# subset of supported models (the complement of the small subset in the above
|
||||
# test.) Also run if model initialization test file is modified
|
||||
- pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Basic Models Tests (Other)
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_utils.py
|
||||
- tests/models/test_vision.py
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||
|
||||
- label: Transformers Nightly Models
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
soft_fail: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
22
.buildkite/test_areas/models_distributed.yaml
Normal file
22
.buildkite/test_areas/models_distributed.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
group: Models - Distributed
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Distributed Model Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||
- vllm/model_executor/models/
|
||||
- tests/basic_correctness/
|
||||
- tests/model_executor/model_loader/test_sharded_state_loader.py
|
||||
- tests/models/
|
||||
commands:
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||
91
.buildkite/test_areas/models_language.yaml
Normal file
91
.buildkite/test_areas/models_language.yaml
Normal file
@@ -0,0 +1,91 @@
|
||||
group: Models - Language
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Test standard language models, excluding a subset of slow tests
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m 'core_model and (not slow_test)'
|
||||
|
||||
- label: Language Models Tests (Extra Standard) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- tests/models/language/pooling/test_embedding.py
|
||||
- tests/models/language/generation/test_common.py
|
||||
- tests/models/language/pooling/test_classification.py
|
||||
commands:
|
||||
# Shard slow subset of standard language models tests. Only run when model
|
||||
# source is modified, or when specified test files are modified
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Language Models Tests (Hybrid) %N
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
# Shard hybrid language model tests
|
||||
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||
parallelism: 2
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 80min
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||
|
||||
- label: Language Models Test (PPL)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation_ppl_test
|
||||
commands:
|
||||
- pytest -v -s models/language/generation_ppl_test
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
|
||||
- label: Language Models Test (MTEB)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling_mteb_test
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling_mteb_test
|
||||
79
.buildkite/test_areas/models_multimodal.yaml
Normal file
79
.buildkite/test_areas/models_multimodal.yaml
Normal file
@@ -0,0 +1,79 @@
|
||||
group: Models - Multimodal
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Multi-Modal Models (Standard) # 60min
|
||||
timeout_in_minutes: 80
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Processor # 44min
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models (Extended) 1
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||
|
||||
- label: Multi-Modal Models (Extended) 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models (Extended) 3
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
# PR authors can temporarily add commands below to test individual models
|
||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||
34
.buildkite/test_areas/plugins.yaml
Normal file
34
.buildkite/test_areas/plugins.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
group: Plugins
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Plugin Tests (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
commands:
|
||||
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
# end platform plugin tests
|
||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||
- pip uninstall dummy_stat_logger -y
|
||||
# end stat_logger plugins test
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
50
.buildkite/test_areas/pytorch.yaml
Normal file
50
.buildkite/test_areas/pytorch.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
group: PyTorch
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# Run unit tests defined directly under compile/,
|
||||
# not including subdirectories, which are usually heavier
|
||||
# tests covered elsewhere.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||
|
||||
- label: PyTorch Fullgraph
|
||||
timeout_in_minutes: 40
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
# of the dependencies. Please check the error message and add the package to whitelist
|
||||
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- requirements/nightly_torch_test.txt
|
||||
commands:
|
||||
- bash standalone_tests/pytorch_nightly_dependency.sh
|
||||
46
.buildkite/test_areas/quantization.yaml
Normal file
46
.buildkite/test_areas/quantization.yaml
Normal file
@@ -0,0 +1,46 @@
|
||||
group: Quantization
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Quantization
|
||||
timeout_in_minutes: 90
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
commands:
|
||||
# temporary install here since we need nightly, will move to requirements/test.in
|
||||
# after torchao 0.12 release, and pin a working version of torchao nightly here
|
||||
|
||||
# since torchao nightly is only compatible with torch nightly currently
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/models/llama4.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization/compressed_tensors
|
||||
- vllm/model_executor/layers/quantization/modelopt.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized Models Test
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/models/quantization
|
||||
commands:
|
||||
- pytest -v -s models/quantization
|
||||
14
.buildkite/test_areas/samplers.yaml
Normal file
14
.buildkite/test_areas/samplers.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
group: Samplers
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Samplers Test
|
||||
timeout_in_minutes: 75
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
- tests/samplers
|
||||
- tests/conftest.py
|
||||
commands:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
13
.buildkite/test_areas/tool_use.yaml
Normal file
13
.buildkite/test_areas/tool_use.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
group: Tool use
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: OpenAI-Compatible Tool Use
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: false
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- pytest -v -s tool_use
|
||||
25
.buildkite/test_areas/weight_loading.yaml
Normal file
25
.buildkite/test_areas/weight_loading.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
group: Weight Loading
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Weight Loading Multiple GPU # 33min
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
8
.github/CODEOWNERS
vendored
8
.github/CODEOWNERS
vendored
@@ -9,6 +9,7 @@
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
/vllm/model_executor/layers/batch_invariant.py @yewentao256
|
||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
@@ -35,6 +36,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/offloading @ApostaC
|
||||
|
||||
# Model runner V2
|
||||
/vllm/v1/worker/gpu @WoosukKwon
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
@@ -56,6 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
/tests/v1/determinism @yewentao256
|
||||
|
||||
# Transformers modeling backend
|
||||
/vllm/model_executor/models/transformers @hmellor
|
||||
@@ -141,9 +146,10 @@ mkdocs.yaml @hmellor
|
||||
/requirements/kv_connectors.txt @NickLucche
|
||||
|
||||
# Pooling models
|
||||
/examples/*/pooling/ @noooop
|
||||
/examples/pooling @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/entrypoints/pooling @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
/vllm/pooling_params.py @noooop
|
||||
/vllm/model_executor/layers/pooler.py @noooop
|
||||
|
||||
50
.github/mergify.yml
vendored
50
.github/mergify.yml
vendored
@@ -14,6 +14,52 @@ pull_request_rules:
|
||||
comment:
|
||||
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
|
||||
|
||||
- name: comment-pre-commit-failure
|
||||
description: Comment on PR when pre-commit check fails
|
||||
conditions:
|
||||
- status-failure=pre-commit
|
||||
- -closed
|
||||
- -draft
|
||||
actions:
|
||||
comment:
|
||||
message: |
|
||||
Hi @{{author}}, the pre-commit checks have failed. Please run:
|
||||
|
||||
```bash
|
||||
uv pip install pre-commit
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
Then, commit the changes and push to your branch.
|
||||
|
||||
For future commits, `pre-commit` will run automatically on changed files before each commit.
|
||||
|
||||
> [!TIP]
|
||||
> <details>
|
||||
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
|
||||
> <br/>
|
||||
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
|
||||
>
|
||||
> ```bash
|
||||
> # For mypy (substitute "3.10" with the failing version if needed)
|
||||
> pre-commit run --hook-stage manual mypy-3.10
|
||||
> # For markdownlint
|
||||
> pre-commit run --hook-stage manual markdownlint
|
||||
> ```
|
||||
> </details>
|
||||
|
||||
- name: comment-dco-failure
|
||||
description: Comment on PR when DCO check fails
|
||||
conditions:
|
||||
- status-failure=dco
|
||||
- -closed
|
||||
- -draft
|
||||
actions:
|
||||
comment:
|
||||
message: |
|
||||
Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
|
||||
|
||||
- name: label-ci-build
|
||||
description: Automatically apply ci/build label
|
||||
conditions:
|
||||
@@ -140,7 +186,7 @@ pull_request_rules:
|
||||
- files~=^tests/entrypoints/test_context.py
|
||||
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/entrypoints/harmony_utils.py
|
||||
- files~=^vllm/entrypoints/openai/parser/harmony_utils.py
|
||||
- files~=^vllm/entrypoints/tool_server.py
|
||||
- files~=^vllm/entrypoints/tool.py
|
||||
- files~=^vllm/entrypoints/context.py
|
||||
@@ -358,4 +404,4 @@ pull_request_rules:
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- kv-connector
|
||||
- kv-connector
|
||||
|
||||
4
.github/workflows/cleanup_pr_body.yml
vendored
4
.github/workflows/cleanup_pr_body.yml
vendored
@@ -13,10 +13,10 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
|
||||
25
.github/workflows/issue_autolabel.yml
vendored
25
.github/workflows/issue_autolabel.yml
vendored
@@ -105,6 +105,31 @@ jobs:
|
||||
}
|
||||
],
|
||||
},
|
||||
cpu: {
|
||||
// Keyword search - matches whole words only (with word boundaries)
|
||||
keywords: [
|
||||
{
|
||||
term: "CPU Backend",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "x86",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "ARM",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "Apple Silicon",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "IBM Z",
|
||||
searchIn: "title"
|
||||
},
|
||||
],
|
||||
},
|
||||
// Add more label configurations here as needed
|
||||
// example: {
|
||||
// keywords: [...],
|
||||
|
||||
9
.github/workflows/macos-smoke-test.yml
vendored
9
.github/workflows/macos-smoke-test.yml
vendored
@@ -9,10 +9,10 @@ on:
|
||||
jobs:
|
||||
macos-m1-smoke-test:
|
||||
runs-on: macos-latest
|
||||
timeout-minutes: 20
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6.0.1
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
@@ -37,15 +37,14 @@ jobs:
|
||||
- name: Verify installation
|
||||
run: |
|
||||
python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
||||
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
|
||||
|
||||
- name: Smoke test vllm serve
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
# Start server in background
|
||||
vllm serve Qwen/Qwen3-0.6B \
|
||||
--max-model-len=2048 \
|
||||
--max-model-len=2K \
|
||||
--load-format=dummy \
|
||||
--hf-overrides '{"num_hidden_layers": 2}' \
|
||||
--enforce-eager \
|
||||
--port 8000 &
|
||||
|
||||
|
||||
4
.github/workflows/pre-commit.yml
vendored
4
.github/workflows/pre-commit.yml
vendored
@@ -16,8 +16,8 @@ jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||
|
||||
4
.github/workflows/stale.yml
vendored
4
.github/workflows/stale.yml
vendored
@@ -7,13 +7,15 @@ on:
|
||||
|
||||
jobs:
|
||||
close-issues-and-pull-requests:
|
||||
# Prevents triggering on forks or other repos
|
||||
if: github.repository == 'vllm-project/vllm'
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
actions: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
||||
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
|
||||
with:
|
||||
# Increasing this value ensures that changes to this workflow
|
||||
# propagate to all issues and PRs in days rather than months
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -4,6 +4,9 @@
|
||||
# vllm-flash-attn built from source
|
||||
vllm/vllm_flash_attn/*
|
||||
|
||||
# OpenAI triton kernels copied from source
|
||||
vllm/third_party/triton_kernels/*
|
||||
|
||||
# triton jit
|
||||
.triton
|
||||
|
||||
|
||||
140
CMakeLists.txt
140
CMakeLists.txt
@@ -136,7 +136,7 @@ elseif(HIP_FOUND)
|
||||
|
||||
# ROCm 5.X and 6.X
|
||||
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
@@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||
|
||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
|
||||
set(CUTLASS_REVISION "v4.2.1")
|
||||
|
||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||
@@ -354,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||
# are not supported by Machete yet.
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||
|
||||
# marlin arches for fp16 output
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
||||
# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
|
||||
cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||
# marlin arches for fp8 input
|
||||
# - sm80 doesn't support fp8 computation
|
||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
|
||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
||||
cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
||||
|
||||
if (MARLIN_ARCHS)
|
||||
|
||||
#
|
||||
@@ -365,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
||||
|
||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
||||
|
||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
||||
PYTHONPATH=$ENV{PYTHONPATH}
|
||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
||||
RESULT_VARIABLE marlin_generation_result
|
||||
OUTPUT_VARIABLE marlin_generation_result
|
||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||
@@ -387,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"\nCheck the log for details: "
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
||||
else()
|
||||
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
||||
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
||||
CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
|
||||
message(STATUS "Marlin generation completed successfully.")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
@@ -403,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
|
||||
endif()
|
||||
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
@@ -512,9 +545,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
||||
# require CUDA 12.8 or later
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS
|
||||
@@ -604,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
|
||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${FP4_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
|
||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
||||
@@ -619,9 +655,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
# FP4 Archs and flags
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||
set(SRCS
|
||||
@@ -695,7 +731,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
|
||||
@@ -741,9 +777,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||
@@ -786,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
|
||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
|
||||
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
|
||||
RESULT_VARIABLE machete_generation_result
|
||||
OUTPUT_VARIABLE machete_generation_output
|
||||
@@ -838,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
|
||||
"csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
|
||||
)
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
@@ -908,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||
set(VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/torch_bindings.cpp"
|
||||
"csrc/moe/moe_align_sum_kernels.cu"
|
||||
"csrc/moe/moe_lora_align_sum_kernels.cu"
|
||||
"csrc/moe/topk_softmax_kernels.cu")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
@@ -938,8 +976,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||
# moe marlin arches
|
||||
# note that we always set `use_atomic_add=False` for moe marlin now,
|
||||
# so we don't need 9.0 for bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
|
||||
# moe marlin arches for fp8 input
|
||||
# - sm80 doesn't support fp8 computation
|
||||
# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SASS instruction
|
||||
# so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
|
||||
if (MARLIN_MOE_ARCHS)
|
||||
|
||||
#
|
||||
@@ -949,16 +994,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(MOE_MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
|
||||
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
|
||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
||||
set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
||||
|
||||
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
||||
message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
|
||||
|
||||
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
|
||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
|
||||
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
|
||||
PYTHONPATH=$ENV{PYTHONPATH}
|
||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
|
||||
RESULT_VARIABLE moe_marlin_generation_result
|
||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
||||
@@ -971,7 +1018,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"\nCheck the log for details: "
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
|
||||
else()
|
||||
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
|
||||
set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
|
||||
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
|
||||
message(STATUS "Marlin MOE generation completed successfully.")
|
||||
endif()
|
||||
@@ -979,16 +1026,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
|
||||
file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
|
||||
list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MOE_WNAA16_MARLIN_SRC}"
|
||||
SRCS "${MARLIN_MOE_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
|
||||
set_source_files_properties(${MARLIN_MOE_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
|
||||
if (MARLIN_MOE_FP8_ARCHS)
|
||||
file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_FP8_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
|
||||
set_source_files_properties(${MARLIN_MOE_FP8_SRC}
|
||||
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
|
||||
endif()
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
|
||||
endif()
|
||||
|
||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||
else()
|
||||
@@ -1030,6 +1089,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
WITH_SOABI)
|
||||
endif()
|
||||
|
||||
# For CUDA and HIP builds also build the triton_kernels external package.
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
|
||||
include(cmake/external_projects/triton_kernels.cmake)
|
||||
endif()
|
||||
|
||||
# For CUDA we also build and ship some external projects.
|
||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(cmake/external_projects/flashmla.cmake)
|
||||
|
||||
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
|
||||
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
|
||||
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI).
|
||||
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
|
||||
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
|
||||
@@ -136,16 +137,19 @@ Compute Resources:
|
||||
- Alibaba Cloud
|
||||
- AMD
|
||||
- Anyscale
|
||||
- Arm
|
||||
- AWS
|
||||
- Crusoe Cloud
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Google Cloud
|
||||
- IBM
|
||||
- Intel
|
||||
- Lambda Lab
|
||||
- Nebius
|
||||
- Novita AI
|
||||
- NVIDIA
|
||||
- Red Hat
|
||||
- Replicate
|
||||
- Roblox
|
||||
- RunPod
|
||||
|
||||
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
||||
```
|
||||
|
||||
#### 2. Maximize Throughput with a Latency Requirement
|
||||
### 2. Maximize Throughput with a Latency Requirement
|
||||
|
||||
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
|
||||
- **Configuration**:
|
||||
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=500
|
||||
```
|
||||
|
||||
#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
|
||||
### 3. Maximize Throughput with Prefix Caching and Latency Requirements
|
||||
|
||||
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
|
||||
- **Configuration**:
|
||||
|
||||
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
|
||||
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
|
||||
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
|
||||
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
|
||||
HOSTNAME=$(hostname)
|
||||
if [[ -z "$HOSTNAME" ]]; then
|
||||
echo "Error: Failed to determine hostname." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||
RESULT="$LOG_FOLDER/result.txt"
|
||||
@@ -82,6 +87,7 @@ start_server() {
|
||||
"$MODEL"
|
||||
"--disable-log-requests"
|
||||
"--port" "8004"
|
||||
"--host" "$HOSTNAME"
|
||||
"--gpu-memory-utilization" "$gpu_memory_utilization"
|
||||
"--max-num-seqs" "$max_num_seqs"
|
||||
"--max-num-batched-tokens" "$max_num_batched_tokens"
|
||||
@@ -96,8 +102,9 @@ start_server() {
|
||||
# This correctly passes each element as a separate argument.
|
||||
if [[ -n "$profile_dir" ]]; then
|
||||
# Start server with profiling enabled
|
||||
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||
else
|
||||
# Start server without profiling
|
||||
VLLM_SERVER_DEV_MODE=1 \
|
||||
@@ -112,7 +119,7 @@ start_server() {
|
||||
# since that we should always have permission to send signal to the server process.
|
||||
kill -0 $server_pid 2> /dev/null || break
|
||||
|
||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||
server_started=1
|
||||
@@ -172,6 +179,7 @@ run_benchmark() {
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 1000 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
@@ -187,7 +195,7 @@ run_benchmark() {
|
||||
request_rate=$((${throughput%.*} + 1))
|
||||
while ((request_rate > 0)); do
|
||||
# clear prefix cache
|
||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
|
||||
sleep 5
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||
vllm bench serve \
|
||||
@@ -203,6 +211,7 @@ run_benchmark() {
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
@@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--host "$HOSTNAME" \
|
||||
--port 8004 \
|
||||
--profile &> "$bm_log"
|
||||
else
|
||||
|
||||
@@ -620,7 +620,7 @@ def get_tokenizer(
|
||||
kwargs["use_fast"] = False
|
||||
if tokenizer_mode == "mistral":
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"MistralTokenizer requires vllm package.\n"
|
||||
|
||||
120
benchmarks/benchmark_hash.py
Normal file
120
benchmarks/benchmark_hash.py
Normal file
@@ -0,0 +1,120 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
|
||||
|
||||
This focuses on a single test payload shaped like the prefix-cache hash input:
|
||||
(32-byte bytes object, 32-int tuple)
|
||||
|
||||
Usage:
|
||||
python benchmarks/benchmark_hash.py --iterations 20000
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import statistics
|
||||
import time
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
from vllm.utils.hashing import sha256, xxhash
|
||||
|
||||
|
||||
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
|
||||
"""Generate a deterministic test payload."""
|
||||
random.seed(seed)
|
||||
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
|
||||
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
|
||||
return (bytes_data, int_tuple)
|
||||
|
||||
|
||||
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
|
||||
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
|
||||
times: list[float] = []
|
||||
|
||||
# Warm-up to avoid first-run noise.
|
||||
for _ in range(200):
|
||||
func(data)
|
||||
|
||||
for _ in range(iterations):
|
||||
start = time.perf_counter()
|
||||
func(data)
|
||||
end = time.perf_counter()
|
||||
times.append(end - start)
|
||||
|
||||
avg = statistics.mean(times)
|
||||
std = statistics.stdev(times) if len(times) > 1 else 0.0
|
||||
return avg, std
|
||||
|
||||
|
||||
def _run_benchmarks(
    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
    data: tuple,
    iterations: int,
):
    """Lazily benchmark each ``(name, func)`` pair on ``data``.

    Yields ``(name, avg, std)`` per benchmark. A benchmark that raises
    ``ModuleNotFoundError`` is reported on stdout and skipped.
    """
    for label, hasher in benchmarks:
        try:
            timing = _benchmark_func(hasher, data, iterations)
        except ModuleNotFoundError as missing:
            print(f"Skipping {label}: {missing}")
        else:
            mean_s, std_s = timing
            yield label, mean_s, std_s
|
||||
|
||||
|
||||
def builtin_hash(data: tuple) -> int:
    """Hash ``data`` with Python's built-in ``hash()`` and return the result."""
    digest = hash(data)
    return digest
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI options, run every hash benchmark, and print the results.

    Prints a per-function latency table, followed by a summary of how each
    function compares to the built-in ``hash()`` baseline when that baseline
    was measured.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--iterations",
        type=int,
        default=10_000,
        help="Number of measured iterations per hash function.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for test payload."
    )
    args = parser.parse_args()
    # With no timed samples the mean cannot be computed; fail with a clear
    # usage error instead of a traceback from the statistics module.
    if args.iterations < 1:
        parser.error("--iterations must be at least 1")

    data = _generate_test_data(args.seed)
    benchmarks = (
        ("SHA256 (pickle)", sha256),
        ("xxHash (pickle)", xxhash),
        ("built-in hash()", builtin_hash),
    )

    print("=" * 60)
    print("HASH FUNCTION MICRO BENCHMARK")
    print("=" * 60)
    print("Test data: (32-byte bytes object, 32-int tuple)")
    print(f"Iterations: {args.iterations:,}")
    print("=" * 60)

    results = list(_run_benchmarks(benchmarks, data, args.iterations))
    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)

    print("\nResults:")
    for name, avg, std in results:
        print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")

    if builtin_entry:
        _, builtin_avg, _ = builtin_entry
        print("\n" + "=" * 60)
        print("SUMMARY (relative to built-in hash())")
        print("=" * 60)
        for name, avg, _ in results:
            if name == "built-in hash()":
                continue
            # A baseline that measured as zero (timer resolution) would make
            # the ratio undefined; report that instead of dividing by zero.
            if builtin_avg <= 0:
                print(f"• {name}: built-in hash() measured as 0s; ratio unavailable")
                continue
            speed_ratio = avg / builtin_avg
            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
    else:
        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -32,12 +32,11 @@ def benchmark_propose(args):
|
||||
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
max_model_len=args.num_token + args.num_spec_token,
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=None,
|
||||
seed=0,
|
||||
trust_remote_code=False,
|
||||
)
|
||||
proposer = NgramProposer(
|
||||
@@ -108,7 +107,10 @@ def benchmark_batched_propose(args):
|
||||
device_config=DeviceConfig(device=current_platform.device_type),
|
||||
parallel_config=ParallelConfig(),
|
||||
load_config=LoadConfig(),
|
||||
scheduler_config=SchedulerConfig(),
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_model_len=model_config.max_model_len,
|
||||
is_encoder_decoder=model_config.is_encoder_decoder,
|
||||
),
|
||||
)
|
||||
|
||||
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
|
||||
|
||||
110
benchmarks/benchmark_prefix_block_hash.py
Normal file
110
benchmarks/benchmark_prefix_block_hash.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Simple benchmark to compare prefix-cache block hashing algorithms.
|
||||
|
||||
Example:
|
||||
python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from collections.abc import Callable, Iterable, Sequence
|
||||
|
||||
from vllm.utils.hashing import get_hash_fn_by_name
|
||||
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
|
||||
|
||||
SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
|
||||
|
||||
|
||||
def _generate_blocks(
|
||||
num_blocks: int, block_size: int, vocab_size: int, seed: int
|
||||
) -> list[list[int]]:
|
||||
rng = random.Random(seed)
|
||||
return [
|
||||
[rng.randrange(vocab_size) for _ in range(block_size)]
|
||||
for _ in range(num_blocks)
|
||||
]
|
||||
|
||||
|
||||
def _hash_all_blocks(
    hash_fn: Callable[[object], bytes],
    blocks: Iterable[Sequence[int]],
) -> float:
    """Hash ``blocks`` as one chain and return the elapsed seconds.

    Each block is hashed with ``hash_block_tokens``, passing the previous
    block's result as the parent hash (``None`` for the first block).
    """
    previous: BlockHash | None = None
    started = time.perf_counter()
    for tokens in blocks:
        previous = hash_block_tokens(hash_fn, previous, tokens, extra_keys=None)
    return time.perf_counter() - started
|
||||
|
||||
|
||||
def _benchmark(
    hash_algo: str,
    blocks: list[list[int]],
    trials: int,
) -> tuple[float, float, float] | None:
    """Benchmark one hash algorithm over ``blocks``.

    Args:
        hash_algo: Algorithm name passed to ``get_hash_fn_by_name``.
        blocks: Token blocks to hash as a chain on every trial.
        trials: Number of timed passes over ``blocks``.

    Returns:
        ``(avg, best, throughput)``: the mean and minimum trial time in
        seconds and the tokens-per-second rate of the best trial. Returns
        ``None`` (after a note on stderr) when the algorithm raises
        ``ModuleNotFoundError``.
    """
    try:
        hash_fn = get_hash_fn_by_name(hash_algo)
        init_none_hash(hash_fn)
        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
    except ModuleNotFoundError as exc:
        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
        return None

    avg = statistics.mean(timings)
    best = min(timings)
    # throughput: tokens / second
    # Count tokens per block instead of indexing blocks[0], so an empty block
    # list (e.g. --num-blocks 0) no longer raises IndexError.
    tokens_hashed = sum(len(block) for block in blocks)
    if best > 0:
        throughput = tokens_hashed / best
    else:
        # Elapsed time below timer resolution: avoid dividing by zero.
        throughput = float("inf") if tokens_hashed else 0.0
    return avg, best, throughput
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI options, generate the blocks, and report each algorithm.

    Prints one line per benchmarked algorithm with its average time, best
    time, and throughput; algorithms for which ``_benchmark`` returns ``None``
    are omitted.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
    cli.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
    cli.add_argument(
        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
    )
    cli.add_argument("--seed", type=int, default=0, help="Random seed.")
    cli.add_argument(
        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
    )
    cli.add_argument(
        "--algorithms",
        nargs="+",
        default=SUPPORTED_ALGOS,
        choices=SUPPORTED_ALGOS,
        help="Hash algorithms to benchmark.",
    )
    opts = cli.parse_args()

    token_blocks = _generate_blocks(
        opts.num_blocks, opts.block_size, opts.vocab_size, opts.seed
    )
    print(
        f"Benchmarking {len(opts.algorithms)} algorithms on "
        f"{opts.num_blocks} blocks (block size={opts.block_size})."
    )

    for algo in opts.algorithms:
        outcome = _benchmark(algo, token_blocks, opts.trials)
        if outcome is not None:
            avg, best, throughput = outcome
            print(
                f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
                f"throughput: {throughput / 1e6:.2f}M tokens/s"
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
except ImportError:
|
||||
from backend_request_func import get_tokenizer
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
except ImportError:
|
||||
from backend_request_func import get_tokenizer
|
||||
|
||||
@@ -574,7 +574,7 @@ async def benchmark(
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
||||
"Total token throughput (tok/s):", metrics.total_token_throughput
|
||||
)
|
||||
)
|
||||
|
||||
@@ -963,8 +963,7 @@ def create_argument_parser():
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Use Torch Profiler. The endpoint must be launched with "
|
||||
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
|
||||
help="Use vLLM Profiling. --profiler-config must be provided on the server.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result-dir",
|
||||
|
||||
@@ -5,11 +5,12 @@ import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
from quart import Quart, Response, make_response, request
|
||||
from rate_limiter import RateLimiter
|
||||
from request_queue import RequestQueue
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -24,26 +25,8 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=float,
|
||||
default=300,
|
||||
help="Timeout for backend service requests in seconds (default: 300)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-concurrent",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Maximum concurrent requests to backend services (default: 100)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--queue-size",
|
||||
type=int,
|
||||
default=500,
|
||||
help="Maximum number of requests in the queue (default: 500)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rate-limit",
|
||||
type=int,
|
||||
default=40,
|
||||
help="Maximum requests per second (default: 40)",
|
||||
default=6 * 60 * 60,
|
||||
help="Timeout for backend service requests in seconds (default: 21600)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
@@ -54,14 +37,32 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--prefill-url",
|
||||
type=str,
|
||||
default="http://localhost:8100/v1/completions",
|
||||
help="Prefill service endpoint URL",
|
||||
default="http://localhost:8100",
|
||||
help="Prefill service base URL (protocol + host[:port])",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decode-url",
|
||||
type=str,
|
||||
default="http://localhost:8200/v1/completions",
|
||||
help="Decode service endpoint URL",
|
||||
default="http://localhost:8200",
|
||||
help="Decode service base URL (protocol + host[:port])",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--kv-host",
|
||||
type=str,
|
||||
default="localhost",
|
||||
help="Hostname or IP used by KV transfer (default: localhost)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefill-kv-port",
|
||||
type=int,
|
||||
default=14579,
|
||||
help="Prefill KV port (default: 14579)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decode-kv-port",
|
||||
type=int,
|
||||
default=14580,
|
||||
help="Decode KV port (default: 14580)",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
@@ -73,70 +74,129 @@ def main():
|
||||
|
||||
# Initialize configuration using command line parameters
|
||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
|
||||
MAX_CONCURRENT_REQUESTS = args.max_concurrent
|
||||
REQUEST_QUEUE_SIZE = args.queue_size
|
||||
RATE_LIMIT = args.rate_limit
|
||||
PREFILL_SERVICE_URL = args.prefill_url
|
||||
DECODE_SERVICE_URL = args.decode_url
|
||||
PORT = args.port
|
||||
|
||||
PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
|
||||
DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
|
||||
|
||||
logger.info(
|
||||
"Proxy resolved KV addresses -> prefill: %s, decode: %s",
|
||||
PREFILL_KV_ADDR,
|
||||
DECODE_KV_ADDR,
|
||||
)
|
||||
|
||||
app = Quart(__name__)
|
||||
|
||||
# Initialize the rate limiter and request queue
|
||||
rate_limiter = RateLimiter(RATE_LIMIT)
|
||||
request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
|
||||
|
||||
# Attach the configuration object to the application instance
|
||||
# Attach the configuration object to the application instance so helper
|
||||
# coroutines can read the resolved backend URLs and timeouts without using
|
||||
# globals.
|
||||
app.config.update(
|
||||
{
|
||||
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
|
||||
"rate_limiter": rate_limiter,
|
||||
"request_queue": request_queue,
|
||||
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
|
||||
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
|
||||
"PREFILL_KV_ADDR": PREFILL_KV_ADDR,
|
||||
"DECODE_KV_ADDR": DECODE_KV_ADDR,
|
||||
}
|
||||
)
|
||||
|
||||
# Start queue processing on app startup
|
||||
@app.before_serving
|
||||
async def startup():
|
||||
"""Start request processing task when app starts serving"""
|
||||
asyncio.create_task(request_queue.process())
|
||||
def _normalize_base_url(url: str) -> str:
|
||||
"""Remove any trailing slash so path joins behave predictably."""
|
||||
return url.rstrip("/")
|
||||
|
||||
async def forward_request(url, data):
|
||||
"""Forward request to backend service with rate limiting and error handling"""
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
def _get_host_port(url: str) -> str:
|
||||
"""Return the hostname:port portion for logging and KV headers."""
|
||||
parsed = urlparse(url)
|
||||
host = parsed.hostname or "localhost"
|
||||
port = parsed.port
|
||||
if port is None:
|
||||
port = 80 if parsed.scheme == "http" else 443
|
||||
return f"{host}:{port}"
|
||||
|
||||
# Use rate limiter as context manager
|
||||
async with (
|
||||
rate_limiter,
|
||||
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
||||
):
|
||||
try:
|
||||
async with session.post(
|
||||
url=url, json=data, headers=headers
|
||||
) as response:
|
||||
if response.status == 200:
|
||||
# Stream response chunks
|
||||
async for chunk_bytes in response.content.iter_chunked(1024):
|
||||
yield chunk_bytes
|
||||
else:
|
||||
# Handle backend service errors
|
||||
error_text = await response.text()
|
||||
logger.error(
|
||||
"Backend service error: %s - %s",
|
||||
response.status,
|
||||
error_text,
|
||||
)
|
||||
yield b'{"error": "Backend service error"}'
|
||||
except aiohttp.ClientError as e:
|
||||
# Handle connection errors
|
||||
logger.error("Connection error to %s: %s", url, str(e))
|
||||
yield b'{"error": "Service unavailable"}'
|
||||
except asyncio.TimeoutError:
|
||||
# Handle timeout errors
|
||||
logger.error("Timeout connecting to %s", url)
|
||||
yield b'{"error": "Service timeout"}'
|
||||
PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
|
||||
DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
|
||||
KV_TARGET = _get_host_port(DECODE_SERVICE_URL)
|
||||
|
||||
def _build_headers(request_id: str) -> dict[str, str]:
|
||||
"""Construct the headers expected by vLLM's P2P disagg connector."""
|
||||
headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
|
||||
api_key = os.environ.get("OPENAI_API_KEY")
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
return headers
|
||||
|
||||
async def _run_prefill(
|
||||
request_path: str,
|
||||
payload: dict,
|
||||
headers: dict[str, str],
|
||||
request_id: str,
|
||||
):
|
||||
url = f"{PREFILL_BASE}{request_path}"
|
||||
start_ts = time.perf_counter()
|
||||
logger.info("[prefill] start request_id=%s url=%s", request_id, url)
|
||||
try:
|
||||
async with (
|
||||
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
||||
session.post(url=url, json=payload, headers=headers) as resp,
|
||||
):
|
||||
if resp.status != 200:
|
||||
error_text = await resp.text()
|
||||
raise RuntimeError(
|
||||
f"Prefill backend error {resp.status}: {error_text}"
|
||||
)
|
||||
await resp.read()
|
||||
logger.info(
|
||||
"[prefill] done request_id=%s status=%s elapsed=%.2fs",
|
||||
request_id,
|
||||
resp.status,
|
||||
time.perf_counter() - start_ts,
|
||||
)
|
||||
except asyncio.TimeoutError as exc:
|
||||
raise RuntimeError(f"Prefill service timeout at {url}") from exc
|
||||
except aiohttp.ClientError as exc:
|
||||
raise RuntimeError(f"Prefill service unavailable at {url}") from exc
|
||||
|
||||
async def _stream_decode(
|
||||
request_path: str,
|
||||
payload: dict,
|
||||
headers: dict[str, str],
|
||||
request_id: str,
|
||||
):
|
||||
url = f"{DECODE_BASE}{request_path}"
|
||||
# Stream tokens from the decode service once the prefill stage has
|
||||
# materialized KV caches on the target workers.
|
||||
logger.info("[decode] start request_id=%s url=%s", request_id, url)
|
||||
try:
|
||||
async with (
|
||||
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
|
||||
session.post(url=url, json=payload, headers=headers) as resp,
|
||||
):
|
||||
if resp.status != 200:
|
||||
error_text = await resp.text()
|
||||
logger.error(
|
||||
"Decode backend error %s - %s", resp.status, error_text
|
||||
)
|
||||
err_msg = (
|
||||
'{"error": "Decode backend error ' + str(resp.status) + '"}'
|
||||
)
|
||||
yield err_msg.encode()
|
||||
return
|
||||
logger.info(
|
||||
"[decode] streaming response request_id=%s status=%s",
|
||||
request_id,
|
||||
resp.status,
|
||||
)
|
||||
async for chunk_bytes in resp.content.iter_chunked(1024):
|
||||
yield chunk_bytes
|
||||
logger.info("[decode] finished streaming request_id=%s", request_id)
|
||||
except asyncio.TimeoutError:
|
||||
logger.error("Decode service timeout at %s", url)
|
||||
yield b'{"error": "Decode service timeout"}'
|
||||
except aiohttp.ClientError as exc:
|
||||
logger.error("Decode service error at %s: %s", url, exc)
|
||||
yield b'{"error": "Decode service unavailable"}'
|
||||
|
||||
async def process_request():
|
||||
"""Process a single request through prefill and decode stages"""
|
||||
@@ -146,13 +206,27 @@ def main():
|
||||
# Create prefill request (max_tokens=1)
|
||||
prefill_request = original_request_data.copy()
|
||||
prefill_request["max_tokens"] = 1
|
||||
if "max_completion_tokens" in prefill_request:
|
||||
prefill_request["max_completion_tokens"] = 1
|
||||
|
||||
# Execute prefill stage
|
||||
async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
|
||||
continue
|
||||
# The request id encodes both KV socket addresses so the backend can
|
||||
# shuttle tensors directly via NCCL once the prefill response
|
||||
# completes.
|
||||
request_id = (
|
||||
f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
|
||||
f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
|
||||
)
|
||||
|
||||
headers = _build_headers(request_id)
|
||||
await _run_prefill(request.path, prefill_request, headers, request_id)
|
||||
|
||||
# Execute decode stage and stream response
|
||||
generator = forward_request(DECODE_SERVICE_URL, original_request_data)
|
||||
# Pass the unmodified user request so the decode phase can continue
|
||||
# sampling with the already-populated KV cache.
|
||||
generator = _stream_decode(
|
||||
request.path, original_request_data, headers, request_id
|
||||
)
|
||||
response = await make_response(generator)
|
||||
response.timeout = None # Disable timeout for streaming response
|
||||
return response
|
||||
@@ -168,23 +242,10 @@ def main():
|
||||
@app.route("/v1/completions", methods=["POST"])
|
||||
async def handle_request():
|
||||
"""Handle incoming API requests with concurrency and rate limiting"""
|
||||
# Create task for request processing
|
||||
task = asyncio.create_task(process_request())
|
||||
|
||||
# Enqueue request or reject if queue is full
|
||||
if not await request_queue.enqueue(task):
|
||||
return Response(
|
||||
response=b'{"error": "Server busy, try again later"}',
|
||||
status=503,
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
try:
|
||||
# Return the response from the processing task
|
||||
return await task
|
||||
return await process_request()
|
||||
except asyncio.CancelledError:
|
||||
# Handle task cancellation (timeout or queue full)
|
||||
logger.warning("Request cancelled due to timeout or queue full")
|
||||
logger.warning("Request cancelled")
|
||||
return Response(
|
||||
response=b'{"error": "Request cancelled"}',
|
||||
status=503,
|
||||
|
||||
@@ -14,6 +14,9 @@ from tqdm import tqdm
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -22,6 +25,7 @@ class bench_params_t:
|
||||
hidden_size: int
|
||||
add_residual: bool
|
||||
dtype: torch.dtype
|
||||
group_size: list[int]
|
||||
|
||||
def description(self):
|
||||
return (
|
||||
@@ -29,6 +33,7 @@ class bench_params_t:
|
||||
f"x D {self.hidden_size} "
|
||||
f"x R {self.add_residual} "
|
||||
f"x DT {self.dtype}"
|
||||
f"x GS {self.group_size}"
|
||||
)
|
||||
|
||||
|
||||
@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
|
||||
HIDDEN_SIZES = list(range(1024, 8129, 1024))
|
||||
ADD_RESIDUAL = [True, False]
|
||||
DTYPES = [torch.bfloat16, torch.float]
|
||||
GROUP_SIZES = [[1, 64], [1, 128]]
|
||||
|
||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
|
||||
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
|
||||
bench_params = list(
|
||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
|
||||
map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
|
||||
)
|
||||
return bench_params
|
||||
|
||||
@@ -52,6 +58,7 @@ def unfused_int8_impl(
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
|
||||
torch_out, _ = ops.scaled_fp8_quant(torch_out)
|
||||
|
||||
|
||||
def unfused_groupwise_fp8_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
# Norm
|
||||
torch_out = None
|
||||
if residual is None:
|
||||
torch_out = rms_norm_layer.forward_cuda(x, residual)
|
||||
else:
|
||||
torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
|
||||
|
||||
# Quant
|
||||
torch_out, _ = per_token_group_quant_fp8(
|
||||
torch_out, group_size=group_size[1], use_ue8m0=False
|
||||
)
|
||||
|
||||
|
||||
def fused_impl(
|
||||
rms_norm_layer: RMSNorm, # this stores the weights
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||
x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
|
||||
)
|
||||
|
||||
|
||||
def fused_groupwise_impl(
|
||||
rms_norm_layer: RMSNorm, # this stores the weights
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
):
|
||||
out, _ = ops.rms_norm_per_block_quant(
|
||||
x,
|
||||
rms_norm_layer.weight,
|
||||
1e-6,
|
||||
quant_dtype,
|
||||
group_size,
|
||||
residual=residual,
|
||||
is_scale_transposed=True,
|
||||
)
|
||||
|
||||
|
||||
# Bench functions
|
||||
def bench_fn(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
group_size: list[int],
|
||||
label: str,
|
||||
sub_label: str,
|
||||
fn: Callable,
|
||||
@@ -110,10 +158,11 @@ def bench_fn(
|
||||
"x": x,
|
||||
"residual": residual,
|
||||
"quant_dtype": quant_dtype,
|
||||
"group_size": group_size,
|
||||
"fn": fn,
|
||||
}
|
||||
return TBenchmark.Timer(
|
||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
|
||||
stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.int8,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_int8_impl,
|
||||
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_fp8_impl,
|
||||
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.int8,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_impl,
|
||||
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_impl,
|
||||
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
|
||||
)
|
||||
)
|
||||
|
||||
# unfused groupwise fp8 impl.
|
||||
timers.append(
|
||||
bench_fn(
|
||||
layer,
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
unfused_groupwise_fp8_impl,
|
||||
"unfused_groupwise_fp8_impl",
|
||||
)
|
||||
)
|
||||
|
||||
# fused groupwise fp8 impl.
|
||||
timers.append(
|
||||
bench_fn(
|
||||
layer,
|
||||
x,
|
||||
residual,
|
||||
torch.float8_e4m3fn,
|
||||
params.group_size,
|
||||
label,
|
||||
sub_label,
|
||||
fused_groupwise_impl,
|
||||
"fused_groupwise_fp8_impl",
|
||||
)
|
||||
)
|
||||
|
||||
print_timers(timers)
|
||||
|
||||
return timers
|
||||
|
||||
244
benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
Normal file
244
benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
Normal file
@@ -0,0 +1,244 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from itertools import product
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_per_token_group_quant_fp8_colmajor,
|
||||
silu_mul_per_token_group_quant_fp8_colmajor,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
|
||||
|
||||
from .utils import ArgPool, Bench, CudaGraphBenchParams
|
||||
|
||||
GROUP_SIZE = 128
|
||||
FLOAT8_T = torch.float8_e4m3fn
|
||||
|
||||
|
||||
def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
|
||||
print(
|
||||
f"Note : The timings reported above is for {cuda_graph_nops} "
|
||||
"consecutive invocations of the benchmarking functions. "
|
||||
f"Please divide by {cuda_graph_nops} for single invocation "
|
||||
"timings."
|
||||
)
|
||||
compare = TBenchmark.Compare(timers)
|
||||
compare.print()
|
||||
|
||||
|
||||
class ImplType(Enum):
|
||||
SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
|
||||
REFERENCE = 2
|
||||
|
||||
def get_impl(self):
|
||||
if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
||||
return silu_mul_per_token_group_quant_fp8_colmajor
|
||||
elif self == ImplType.REFERENCE:
|
||||
return reference
|
||||
raise ValueError(f"Unrecognized ImplType {self}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkTensors:
|
||||
input: torch.Tensor
|
||||
output: torch.Tensor
|
||||
|
||||
# Reference act output tensor
|
||||
ref_act_out: torch.Tensor
|
||||
ref_quant_out: torch.Tensor
|
||||
|
||||
@staticmethod
|
||||
def make(T: int, N: int) -> "BenchmarkTensors":
|
||||
assert T % GROUP_SIZE == 0
|
||||
assert N % (GROUP_SIZE * 2) == 0
|
||||
|
||||
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
|
||||
|
||||
# silu_mul_per_token_group_quant_fp8_colmajor output.
|
||||
output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
|
||||
FLOAT8_T
|
||||
)
|
||||
|
||||
# reference output.
|
||||
ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
|
||||
ref_quant_out = torch.empty(
|
||||
(T, N // 2), dtype=torch.bfloat16, device="cuda"
|
||||
).to(FLOAT8_T)
|
||||
|
||||
return BenchmarkTensors(
|
||||
input=input,
|
||||
output=output,
|
||||
ref_act_out=ref_act_out,
|
||||
ref_quant_out=ref_quant_out,
|
||||
)
|
||||
|
||||
@property
|
||||
def T(self):
|
||||
return self.input.size(0)
|
||||
|
||||
@property
|
||||
def N(self):
|
||||
return self.input.size(1)
|
||||
|
||||
def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
|
||||
if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
|
||||
return {
|
||||
"input": self.input,
|
||||
"output": self.output,
|
||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
||||
}
|
||||
elif impl_type == ImplType.REFERENCE:
|
||||
return {
|
||||
"input": self.input,
|
||||
"act_out": self.ref_act_out,
|
||||
"quant_out": self.ref_quant_out,
|
||||
"use_ue8m0": is_deep_gemm_e8m0_used(),
|
||||
}
|
||||
raise ValueError(f"Unrecognized impl_type {impl_type}")
|
||||
|
||||
|
||||
def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
|
||||
"""
|
||||
Reference triton quant kernel from,
|
||||
vllm.model_executor.layers.quantization.utils.fp8_utils
|
||||
"""
|
||||
assert quant_out.size() == x.size()
|
||||
# Allocate the scale tensor column-major format.
|
||||
shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
|
||||
x_q = quant_out
|
||||
x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
|
||||
|
||||
M = x.numel() // GROUP_SIZE
|
||||
N = GROUP_SIZE
|
||||
BLOCK = triton.next_power_of_2(N)
|
||||
# heuristics for number of warps
|
||||
num_warps = min(max(BLOCK // 256, 1), 8)
|
||||
num_stages = 1
|
||||
|
||||
finfo = torch.finfo(FLOAT8_T)
|
||||
fp8_min = finfo.min
|
||||
fp8_max = finfo.max
|
||||
|
||||
_per_token_group_quant_fp8_colmajor[(M,)](
|
||||
x,
|
||||
x_q,
|
||||
x_s,
|
||||
GROUP_SIZE,
|
||||
x.shape[1],
|
||||
x.stride(0),
|
||||
x_s.stride(1),
|
||||
eps=1e-10,
|
||||
fp8_min=fp8_min,
|
||||
fp8_max=fp8_max,
|
||||
use_ue8m0=use_ue8m0,
|
||||
BLOCK=BLOCK,
|
||||
num_warps=num_warps,
|
||||
num_stages=num_stages,
|
||||
)
|
||||
return x_q, x_s
|
||||
|
||||
|
||||
def reference(
|
||||
input: torch.Tensor,
|
||||
act_out: torch.Tensor,
|
||||
quant_out: torch.Tensor,
|
||||
use_ue8m0: bool,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
torch.ops._C.silu_and_mul(act_out, input)
|
||||
return reference_quant(act_out, quant_out, use_ue8m0)
|
||||
|
||||
|
||||
def bench_impl(
|
||||
bench_tensors: list[BenchmarkTensors], impl_type: ImplType
|
||||
) -> TMeasurement:
|
||||
T = bench_tensors[0].T
|
||||
N = bench_tensors[0].N
|
||||
|
||||
arg_pool_size = len(bench_tensors)
|
||||
kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
|
||||
|
||||
# warmup
|
||||
for kwargs in kwargs_list:
|
||||
impl_type.get_impl()(**kwargs)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||
for _kwargs in kwargs_list:
|
||||
for k, v in _kwargs.items():
|
||||
kwargs[k].values.append(v)
|
||||
|
||||
cuda_graph_params = None
|
||||
cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
|
||||
timer = None
|
||||
with Bench(
|
||||
cuda_graph_params,
|
||||
"silu-mul-quant",
|
||||
f"num_tokens={T}, N={N}",
|
||||
impl_type.name,
|
||||
impl_type.get_impl(),
|
||||
**kwargs,
|
||||
) as bench:
|
||||
timer = bench.run()
|
||||
return timer
|
||||
|
||||
|
||||
def test_correctness(T: int, N: int):
|
||||
print(f"Testing num_tokens={T}, N={N} ...")
|
||||
|
||||
bench_tensor = BenchmarkTensors.make(T, N)
|
||||
|
||||
def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
|
||||
|
||||
# reference output
|
||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
||||
|
||||
# test ouptut
|
||||
out_q, out_s = output_from_impl(
|
||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||
)
|
||||
|
||||
torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
|
||||
torch.testing.assert_close(ref_out_s, out_s)
|
||||
|
||||
|
||||
def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
|
||||
timers = []
|
||||
for N, T in product(Ns, Ts):
|
||||
test_correctness(T, N)
|
||||
|
||||
bench_tensors: list[BenchmarkTensors] = [
|
||||
BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
|
||||
]
|
||||
|
||||
silu_mul_quant_timer = bench_impl(
|
||||
bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||
)
|
||||
timers.append(silu_mul_quant_timer)
|
||||
reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
|
||||
timers.append(reference_timer)
|
||||
|
||||
print_timers(
|
||||
[silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
|
||||
)
|
||||
|
||||
print_timers(timers, cuda_graph_nops=arg_pool_size)
|
||||
|
||||
return timers
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
|
||||
N = [2048, 4096, 8192]
|
||||
|
||||
print(f"T = {T}, N = {N}")
|
||||
run(T, N, arg_pool_size=8)
|
||||
@@ -255,8 +255,8 @@ def bench_run(
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Timing
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
latencies = []
|
||||
for _ in range(num_iters):
|
||||
|
||||
@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
|
||||
b_q_weight=w_q,
|
||||
b_bias=None,
|
||||
b_scales=w_s,
|
||||
a_scales=None,
|
||||
global_scale=None,
|
||||
b_zeros=w_zp,
|
||||
g_idx=g_idx,
|
||||
|
||||
@@ -263,7 +263,7 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
@@ -273,7 +273,7 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
|
||||
150
benchmarks/kernels/benchmark_mla_k_concat.py
Normal file
150
benchmarks/kernels/benchmark_mla_k_concat.py
Normal file
@@ -0,0 +1,150 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
|
||||
in MLA (Multi-head Latent Attention) prefill.
|
||||
|
||||
This validates that the optimization from commit 8d4142bd is beneficial across
|
||||
various batch sizes, not just the originally tested batch size of 32768.
|
||||
"""
|
||||
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
|
||||
import torch
|
||||
|
||||
# DeepSeek-V3 MLA dimensions
|
||||
NUM_HEADS = 128
|
||||
QK_NOPE_HEAD_DIM = 128
|
||||
PE_DIM = 64
|
||||
|
||||
|
||||
def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
|
||||
"""Original torch.cat approach with expand."""
|
||||
return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
|
||||
|
||||
|
||||
def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
|
||||
"""Optimized direct copy approach (avoids expand + cat overhead)."""
|
||||
k = torch.empty(
|
||||
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
|
||||
dtype=k_nope.dtype,
|
||||
device=k_nope.device,
|
||||
)
|
||||
k[..., : k_nope.shape[-1]] = k_nope
|
||||
k[..., k_nope.shape[-1] :] = k_pe
|
||||
return k
|
||||
|
||||
|
||||
def benchmark_method(
|
||||
method: Callable,
|
||||
k_nope: torch.Tensor,
|
||||
k_pe: torch.Tensor,
|
||||
num_warmup: int = 10,
|
||||
num_iters: int = 100,
|
||||
) -> float:
|
||||
"""Benchmark a concatenation method and return mean latency in ms."""
|
||||
# Warmup
|
||||
for _ in range(num_warmup):
|
||||
_ = method(k_nope, k_pe)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
start = time.perf_counter()
|
||||
for _ in range(num_iters):
|
||||
_ = method(k_nope, k_pe)
|
||||
torch.cuda.synchronize()
|
||||
end = time.perf_counter()
|
||||
|
||||
return (end - start) / num_iters * 1000 # Convert to ms
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def run_benchmark(dtype: torch.dtype, dtype_name: str):
|
||||
"""Run benchmark for a specific dtype."""
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
# Batch sizes to test (powers of 2 from 32 to 65536)
|
||||
batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
|
||||
|
||||
print("=" * 80)
|
||||
print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
|
||||
print("=" * 80)
|
||||
print(
|
||||
f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
|
||||
f"k_pe=[B, 1, {PE_DIM}]"
|
||||
)
|
||||
print(f"dtype: {dtype_name}")
|
||||
print()
|
||||
print(
|
||||
f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
|
||||
f"{'Speedup':>8} | {'Reduction':>10}"
|
||||
)
|
||||
print("-" * 70)
|
||||
|
||||
results = []
|
||||
for batch_size in batch_sizes:
|
||||
# Create input tensors (generate in float32 then convert for FP8 compatibility)
|
||||
k_nope = torch.randn(
|
||||
batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
|
||||
).to(dtype)
|
||||
k_pe = torch.randn(
|
||||
batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
|
||||
).to(dtype)
|
||||
|
||||
# Benchmark both methods
|
||||
cat_time = benchmark_method(cat_method, k_nope, k_pe)
|
||||
direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
|
||||
|
||||
speedup = cat_time / direct_time
|
||||
reduction = (1 - direct_time / cat_time) * 100
|
||||
|
||||
results.append((batch_size, cat_time, direct_time, speedup, reduction))
|
||||
|
||||
print(
|
||||
f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
|
||||
f"{speedup:>7.2f}x | {reduction:>9.1f}%"
|
||||
)
|
||||
|
||||
print("=" * 80)
|
||||
|
||||
# Summary statistics
|
||||
speedups = [r[3] for r in results]
|
||||
print("\nSpeedup summary:")
|
||||
print(f" Min: {min(speedups):.2f}x")
|
||||
print(f" Max: {max(speedups):.2f}x")
|
||||
print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
|
||||
|
||||
# Find crossover point
|
||||
crossover_batch = None
|
||||
for batch_size, _, _, speedup, _ in results:
|
||||
if speedup >= 1.0:
|
||||
crossover_batch = batch_size
|
||||
break
|
||||
|
||||
print("\nConclusion:")
|
||||
if crossover_batch:
|
||||
print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
|
||||
# Filter for large batches (>= 512 which is typical for prefill)
|
||||
large_batch_speedups = [r[3] for r in results if r[0] >= 512]
|
||||
if large_batch_speedups:
|
||||
avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
|
||||
print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
|
||||
print(" - MLA prefill typically uses large batches, so optimization is effective")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main():
|
||||
# Test bfloat16
|
||||
print("\n")
|
||||
run_benchmark(torch.bfloat16, "bfloat16")
|
||||
|
||||
# Test float8_e4m3fn
|
||||
print("\n")
|
||||
run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -185,8 +185,8 @@ def benchmark_config(
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
|
||||
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
||||
num_tokens_range = [1, 16, 256, 4096]
|
||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||
topk_range = [1, 2, 8]
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
ep_size_range = [1, 8]
|
||||
configs = list(
|
||||
itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
|
||||
)
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_names=["num_tokens", "num_experts", "topk", "ep_size"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["vllm"],
|
||||
@@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
def benchmark(num_tokens, num_experts, topk, ep_size, provider):
|
||||
"""Benchmark function for Triton."""
|
||||
block_size = 256
|
||||
torch.cuda.manual_seed_all(0)
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
e_map = None
|
||||
if ep_size != 1:
|
||||
local_e = num_experts // ep_size
|
||||
e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
|
||||
e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
|
||||
e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "vllm":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: moe_align_block_size(topk_ids, block_size, num_experts),
|
||||
lambda: moe_align_block_size(
|
||||
topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
|
||||
@@ -105,8 +105,8 @@ def benchmark_permute(
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
@@ -241,8 +241,8 @@ def benchmark_unpermute(
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#
|
||||
# The CSV file (named with current date/time) contains these columns:
|
||||
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
|
||||
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
|
||||
# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
|
||||
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
|
||||
# speedup
|
||||
#
|
||||
@@ -86,9 +86,8 @@ def benchmark_mrope(
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
max_position: int = 8192,
|
||||
rope_theta: float = 10000,
|
||||
is_neox_style: bool = True,
|
||||
rope_scaling: dict[str, Any] = None,
|
||||
rope_parameters: dict[str, Any] | None = None,
|
||||
dtype: torch.dtype = torch.bfloat16,
|
||||
seed: int = 0,
|
||||
warmup_iter: int = 10,
|
||||
@@ -100,11 +99,9 @@ def benchmark_mrope(
|
||||
# the parameters to compute the q k v size based on tp_size
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim,
|
||||
max_position=max_position,
|
||||
base=rope_theta,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_scaling=rope_scaling,
|
||||
rope_parameters=rope_parameters,
|
||||
dtype=dtype,
|
||||
).to(device=device)
|
||||
|
||||
@@ -203,9 +200,8 @@ def benchmark_mrope(
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
max_position,
|
||||
rope_theta,
|
||||
is_neox_style,
|
||||
str(rope_scaling),
|
||||
str(rope_parameters),
|
||||
str(dtype).split(".")[-1],
|
||||
torch_stats["mean"],
|
||||
torch_stats["median"],
|
||||
@@ -255,9 +251,8 @@ if __name__ == "__main__":
|
||||
"num_kv_heads",
|
||||
"head_dim",
|
||||
"max_position",
|
||||
"rope_theta",
|
||||
"is_neox_style",
|
||||
"rope_scaling",
|
||||
"rope_parameters",
|
||||
"dtype",
|
||||
"torch_mean",
|
||||
"torch_median",
|
||||
@@ -303,7 +298,7 @@ if __name__ == "__main__":
|
||||
q_size = num_heads * head_dim
|
||||
kv_size = num_kv_heads * head_dim
|
||||
is_neox_style = True
|
||||
rope_theta = config.rope_theta
|
||||
rope_parameters = config.rope_parameters
|
||||
max_position = config.max_position_embeddings
|
||||
|
||||
for num_tokens in num_tokens_list:
|
||||
@@ -315,9 +310,8 @@ if __name__ == "__main__":
|
||||
num_heads=num_heads,
|
||||
num_kv_heads=num_kv_heads,
|
||||
max_position=max_position,
|
||||
rope_theta=rope_theta,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_scaling=config.rope_scaling,
|
||||
rope_parameters=rope_parameters,
|
||||
dtype=getattr(torch, args.dtype),
|
||||
seed=args.seed,
|
||||
warmup_iter=args.warmup_iter,
|
||||
|
||||
@@ -30,8 +30,8 @@ def _time_cuda(
|
||||
fn()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
start = torch.Event(enable_timing=True)
|
||||
end = torch.Event(enable_timing=True)
|
||||
|
||||
start.record()
|
||||
for _ in range(bench_iters):
|
||||
|
||||
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
|
||||
def benchmark(batch_size, seq_len, num_heads, provider):
|
||||
dtype = torch.bfloat16
|
||||
max_position = 8192
|
||||
base = 10000
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
|
||||
rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
|
||||
rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
|
||||
rope = rope.to(dtype=dtype, device=device)
|
||||
cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
|
||||
|
||||
|
||||
@@ -253,8 +253,8 @@ def benchmark(
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
# Benchmark
|
||||
latencies: list[float] = []
|
||||
|
||||
@@ -127,8 +127,8 @@ def benchmark_decode(
|
||||
|
||||
def time_fn(fn, warmup=10, trials=20):
|
||||
torch.cuda.synchronize()
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
start = torch.Event(enable_timing=True)
|
||||
end = torch.Event(enable_timing=True)
|
||||
times = []
|
||||
for i in range(warmup):
|
||||
fn()
|
||||
|
||||
@@ -139,8 +139,8 @@ def benchmark_prefill(
|
||||
|
||||
def time_fn(fn, warmup=10, trials=20):
|
||||
torch.cuda.synchronize()
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
start = torch.Event(enable_timing=True)
|
||||
end = torch.Event(enable_timing=True)
|
||||
times = []
|
||||
for i in range(warmup):
|
||||
fn()
|
||||
|
||||
@@ -183,8 +183,8 @@ def benchmark_config(
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
|
||||
|
||||
Currently this just includes dense GEMMs and only works on Hopper GPUs.
|
||||
Currently, this just includes dense GEMMs and only works on Hopper GPUs.
|
||||
|
||||
## Setup
|
||||
|
||||
|
||||
@@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
|
||||
----------------------------------------------------------------------------------------------------
|
||||
```
|
||||
|
||||
If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
|
||||
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
|
||||
benchmark-only runtime so the reported throughput stays comparable).
|
||||
|
||||
### JSON configuration file for synthetic conversations generation
|
||||
|
||||
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
|
||||
|
||||
@@ -1076,6 +1076,7 @@ def process_statistics(
|
||||
verbose: bool,
|
||||
gen_conv_args: GenConvArgs | None = None,
|
||||
excel_output: bool = False,
|
||||
warmup_runtime_sec: float | None = None,
|
||||
) -> None:
|
||||
if len(client_metrics) == 0:
|
||||
logger.info("No samples to process")
|
||||
@@ -1169,8 +1170,13 @@ def process_statistics(
|
||||
# Convert milliseconds to seconds
|
||||
runtime_sec = runtime_sec / 1000.0
|
||||
requests_per_sec = float(len(df)) / runtime_sec
|
||||
|
||||
params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
|
||||
params = {
|
||||
"runtime_sec": runtime_sec,
|
||||
"requests_per_sec": requests_per_sec,
|
||||
}
|
||||
if warmup_runtime_sec is not None:
|
||||
params["warmup_runtime_sec"] = warmup_runtime_sec
|
||||
params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
|
||||
|
||||
# Generate a summary of relevant metrics (and drop irrelevant data)
|
||||
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
|
||||
@@ -1552,6 +1558,8 @@ async def main() -> None:
|
||||
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
|
||||
)
|
||||
|
||||
warmup_runtime_sec: float | None = None
|
||||
|
||||
# Warm-up step
|
||||
if args.warmup_step:
|
||||
# Only send a single user prompt from every conversation.
|
||||
@@ -1566,26 +1574,56 @@ async def main() -> None:
|
||||
# all clients should finish their work before exiting
|
||||
warmup_bench_args = bench_args._replace(early_stop=False)
|
||||
|
||||
logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
|
||||
logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
|
||||
warmup_start_ns = time.perf_counter_ns()
|
||||
conversations, _ = await main_mp(
|
||||
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
|
||||
)
|
||||
logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
|
||||
warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
|
||||
logger.info(
|
||||
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
|
||||
Color.PURPLE,
|
||||
warmup_runtime_sec,
|
||||
warmup_runtime_sec * 1000,
|
||||
Color.RESET,
|
||||
)
|
||||
logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
|
||||
|
||||
# Run the benchmark
|
||||
start_time = time.perf_counter_ns()
|
||||
benchmark_start_ns = time.perf_counter_ns()
|
||||
client_convs, client_metrics = await main_mp(
|
||||
client_args, req_args, bench_args, tokenizer, conversations
|
||||
)
|
||||
total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
|
||||
benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
|
||||
|
||||
# Calculate requests per second
|
||||
total_runtime_sec = total_runtime_ms / 1000.0
|
||||
rps = len(client_metrics) / total_runtime_sec
|
||||
requests_per_sec = len(client_metrics) / benchmark_runtime_sec
|
||||
benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
|
||||
logger.info(
|
||||
f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
|
||||
f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
|
||||
"%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
|
||||
"requests per second: %.3f%s",
|
||||
Color.GREEN,
|
||||
benchmark_runtime_sec,
|
||||
benchmark_runtime_ms,
|
||||
requests_per_sec,
|
||||
Color.RESET,
|
||||
)
|
||||
if warmup_runtime_sec is not None:
|
||||
total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
|
||||
logger.info(
|
||||
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
|
||||
Color.GREEN,
|
||||
warmup_runtime_sec,
|
||||
warmup_runtime_sec * 1000,
|
||||
Color.RESET,
|
||||
)
|
||||
logger.info(
|
||||
"%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
|
||||
Color.GREEN,
|
||||
total_runtime_sec,
|
||||
total_runtime_sec * 1000,
|
||||
Color.RESET,
|
||||
)
|
||||
|
||||
# Benchmark parameters
|
||||
params = {
|
||||
@@ -1610,6 +1648,7 @@ async def main() -> None:
|
||||
verbose=args.verbose,
|
||||
gen_conv_args=gen_conv_args,
|
||||
excel_output=args.excel_output,
|
||||
warmup_runtime_sec=warmup_runtime_sec,
|
||||
)
|
||||
|
||||
if args.output_file is not None:
|
||||
|
||||
@@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
endif()
|
||||
|
||||
# Build ACL with CMake
|
||||
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
set(ARM_COMPUTE_ARCH "armv8.2-a")
|
||||
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
|
||||
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
|
||||
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
|
||||
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
|
||||
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
|
||||
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
|
||||
set(ARM_COMPUTE_BUILD_TESTING "OFF")
|
||||
|
||||
set(_cmake_config_cmd
|
||||
${CMAKE_COMMAND} -G Ninja -B build
|
||||
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
|
||||
@@ -375,6 +364,7 @@ set(VLLM_EXT_SRC
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
"csrc/cpu/cpu_wna16.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
|
||||
set(VLLM_EXT_SRC
|
||||
|
||||
53
cmake/external_projects/triton_kernels.cmake
Normal file
53
cmake/external_projects/triton_kernels.cmake
Normal file
@@ -0,0 +1,53 @@
|
||||
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
|
||||
|
||||
set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
|
||||
|
||||
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
|
||||
# be directly set to the triton_kernels python directory.
|
||||
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
|
||||
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
|
||||
FetchContent_Declare(
|
||||
triton_kernels
|
||||
SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
|
||||
)
|
||||
|
||||
else()
|
||||
set(TRITON_GIT "https://github.com/triton-lang/triton.git")
|
||||
message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
|
||||
FetchContent_Declare(
|
||||
triton_kernels
|
||||
# TODO (varun) : Fetch just the triton_kernels directory from Triton
|
||||
GIT_REPOSITORY https://github.com/triton-lang/triton.git
|
||||
GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
|
||||
GIT_PROGRESS TRUE
|
||||
SOURCE_SUBDIR python/triton_kernels/triton_kernels
|
||||
)
|
||||
endif()
|
||||
|
||||
# Fetch content
|
||||
FetchContent_MakeAvailable(triton_kernels)
|
||||
|
||||
if (NOT triton_kernels_SOURCE_DIR)
|
||||
message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
|
||||
endif()
|
||||
|
||||
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
|
||||
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
|
||||
else()
|
||||
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
|
||||
endif()
|
||||
|
||||
message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")
|
||||
|
||||
add_custom_target(triton_kernels)
|
||||
|
||||
# Ensure the vllm/third_party directory exists before installation
|
||||
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")
|
||||
|
||||
## Copy .py files to install directory.
|
||||
install(DIRECTORY
|
||||
${TRITON_KERNELS_PYTHON_DIR}
|
||||
DESTINATION
|
||||
vllm/third_party/triton_kernels/
|
||||
COMPONENT triton_kernels
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
@@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
|
||||
GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
|
||||
run_python(_VLLM_TORCH_GOMP_PATH
|
||||
"
|
||||
import os, glob
|
||||
try:
|
||||
import torch
|
||||
torch_pkg = os.path.dirname(torch.__file__)
|
||||
site_root = os.path.dirname(torch_pkg)
|
||||
torch_libs = os.path.join(site_root, 'torch.libs')
|
||||
print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
|
||||
except:
|
||||
print('')
|
||||
import torch
|
||||
torch_pkg = os.path.dirname(torch.__file__)
|
||||
site_root = os.path.dirname(torch_pkg)
|
||||
|
||||
# Search both torch.libs and torch/lib
|
||||
roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')]
|
||||
candidates = []
|
||||
for root in roots:
|
||||
if not os.path.isdir(root):
|
||||
continue
|
||||
candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*')))
|
||||
|
||||
print(candidates[0] if candidates else '')
|
||||
"
|
||||
"failed to probe torch.libs for libgomp")
|
||||
"failed to probe for libgomp")
|
||||
|
||||
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
|
||||
return()
|
||||
@@ -495,7 +500,13 @@ function (define_extension_target MOD_NAME)
|
||||
set(SOABI_KEYWORD "")
|
||||
endif()
|
||||
|
||||
if (ARG_USE_SABI)
|
||||
run_python(IS_FREETHREADED_PYTHON
|
||||
"import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
|
||||
"Failed to determine whether interpreter is free-threaded")
|
||||
|
||||
# Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
|
||||
# so avoid using the stable ABI under free-threading only.
|
||||
if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
|
||||
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
|
||||
else()
|
||||
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
|
||||
|
||||
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
|
||||
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
|
||||
const float* prefix_lse, const scalar_t* suffix_output,
|
||||
const float* suffix_lse, const uint num_tokens, const uint num_heads,
|
||||
const uint head_size) {
|
||||
const uint head_size, const uint prefix_head_stride,
|
||||
const uint output_head_stride) {
|
||||
using pack_128b_t = uint4;
|
||||
const uint pack_size = 16 / sizeof(scalar_t);
|
||||
const uint threads_per_head = head_size / pack_size;
|
||||
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
|
||||
const uint head_idx = token_head_idx % num_heads;
|
||||
|
||||
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
|
||||
const uint head_offset =
|
||||
token_idx * num_heads * head_size + head_idx * head_size;
|
||||
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
|
||||
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
|
||||
scalar_t* output_head_ptr = output + head_offset;
|
||||
const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
|
||||
head_idx * prefix_head_stride;
|
||||
const uint dst_head_offset = token_idx * num_heads * output_head_stride +
|
||||
head_idx * output_head_stride;
|
||||
const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
|
||||
const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
|
||||
scalar_t* output_head_ptr = output + dst_head_offset;
|
||||
|
||||
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
|
||||
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
|
||||
@@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
|
||||
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
|
||||
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
|
||||
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
|
||||
num_heads, head_size); \
|
||||
num_heads, head_size, prefix_head_stride, output_head_stride); \
|
||||
}
|
||||
|
||||
/*@brief Merges the attention states from prefix and suffix
|
||||
@@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
|
||||
const uint num_tokens = output.size(0);
|
||||
const uint num_heads = output.size(1);
|
||||
const uint head_size = output.size(2);
|
||||
const uint prefix_head_stride = prefix_output.stride(1);
|
||||
const uint output_head_stride = output.stride(1);
|
||||
const uint pack_size = 16 / sizeof(scalar_t);
|
||||
TORCH_CHECK(head_size % pack_size == 0,
|
||||
"headsize must be multiple of pack_size:", pack_size);
|
||||
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
|
||||
"output heads must be contiguous in memory");
|
||||
TORCH_CHECK(
|
||||
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
|
||||
"prefix_output heads must be contiguous in memory");
|
||||
TORCH_CHECK(
|
||||
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
|
||||
"suffix_output heads must be contiguous in memory");
|
||||
float* output_lse_ptr = nullptr;
|
||||
if (output_lse.has_value()) {
|
||||
output_lse_ptr = output_lse.value().data_ptr<float>();
|
||||
|
||||
23
csrc/cache.h
23
csrc/cache.h
@@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
@@ -41,11 +42,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double scale, const std::string& kv_cache_dtype);
|
||||
|
||||
void gather_and_maybe_dequant_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
|
||||
int64_t num_tokens, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
@@ -57,6 +59,15 @@ void cp_gather_cache(
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
// Gather and upconvert FP8 KV cache to BF16 workspace
|
||||
void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, 576]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& seq_lens, // [BATCH]
|
||||
torch::Tensor const& workspace_starts, // [BATCH]
|
||||
int64_t batch_size);
|
||||
|
||||
// Indexer K quantization and cache function
|
||||
void indexer_k_quant_and_cache(
|
||||
torch::Tensor& k, // [num_tokens, head_dim]
|
||||
@@ -71,4 +82,4 @@ void cp_gather_indexer_k_quant_cache(
|
||||
torch::Tensor& dst_k, // [num_tokens, head_dim]
|
||||
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
|
||||
const torch::Tensor& block_table, // [batch_size, num_blocks]
|
||||
const torch::Tensor& cu_seq_lens); // [batch_size + 1]
|
||||
const torch::Tensor& cu_seq_lens); // [batch_size + 1]
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "cuda_compat.h"
|
||||
@@ -514,7 +515,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
const int quant_block_size, // quantization block size
|
||||
const int cache_block_size, // cache block size
|
||||
const int cache_stride, // stride for each token in kv_cache
|
||||
const bool use_ue8m0 // use ue8m0 scale format
|
||||
|
||||
const bool use_ue8m0 // use ue8m0 scale format
|
||||
) {
|
||||
constexpr int VEC_SIZE = 4;
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
@@ -552,7 +554,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
#if defined(__gfx942__)
|
||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
||||
#else
|
||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
||||
#endif
|
||||
if (use_ue8m0) {
|
||||
scale = exp2f(ceilf(log2f(scale)));
|
||||
}
|
||||
@@ -901,87 +907,80 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
namespace vllm {
|
||||
|
||||
// grid is launched with dimensions (batch, num_splits)
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
|
||||
int ENTRY_SIZE, int CTA_SIZE>
|
||||
__global__ void gather_and_maybe_dequant_cache(
|
||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||
// ENTRIES...]
|
||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||
const int32_t block_size, const int32_t entry_size,
|
||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||
// ENTRIES...]
|
||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
|
||||
const int32_t num_tokens, const int32_t block_size,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
||||
const float* __restrict__ scale,
|
||||
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
||||
// batch
|
||||
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
|
||||
using ltype = vllm::vec_n_t<cache_t, vec_size>;
|
||||
using stype = vllm::vec_n_t<scalar_t, vec_size>;
|
||||
// We are adding this for code readability which will be optimized out when
|
||||
// build in release.
|
||||
assert(CTA_SIZE == blockDim.x);
|
||||
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = cu_seq_lens[bid];
|
||||
const int32_t seq_end = cu_seq_lens[bid + 1];
|
||||
const int32_t seq_len = seq_end - seq_start;
|
||||
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
|
||||
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
|
||||
#pragma unroll
|
||||
for (int token_id = blockIdx.x; token_id < num_tokens;
|
||||
token_id += gridDim.x) {
|
||||
int64_t batch_id = token_to_seq[token_id];
|
||||
int64_t batch_start = cu_seq_lens[batch_id];
|
||||
int64_t batch_end = cu_seq_lens[batch_id + 1];
|
||||
int32_t batch_offset = token_id - batch_start;
|
||||
|
||||
const int32_t split_start = split * split_blocks;
|
||||
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
|
||||
if (token_id >= batch_end) return;
|
||||
int32_t offset = 0;
|
||||
if (seq_starts != nullptr) {
|
||||
offset = seq_starts[batch_id];
|
||||
}
|
||||
batch_offset += offset;
|
||||
int32_t block_table_id = batch_offset / block_size;
|
||||
int32_t slot_id = batch_offset % block_size;
|
||||
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
|
||||
int32_t block_id = block_table[block_table_offset];
|
||||
int64_t cache_offset =
|
||||
block_id * cache_block_stride + slot_id * cache_entry_stride;
|
||||
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
|
||||
scalar_t* dst_ = dst + token_id * dst_entry_stride;
|
||||
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
|
||||
|
||||
const bool is_active_split = (split_start < tot_blocks);
|
||||
const bool is_last_split = (split_end == tot_blocks);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
int32_t full_blocks_end = split_end;
|
||||
int32_t partial_block_size = 0;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch.
|
||||
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
|
||||
// page_size)
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = 0;
|
||||
if (seq_starts != nullptr) {
|
||||
offset = seq_starts[bid] / block_size;
|
||||
}
|
||||
const int32_t* batch_block_table = block_table + batch_offset + offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths.
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
if (is_last_split) {
|
||||
partial_block_size = seq_len % block_size;
|
||||
if (partial_block_size) full_blocks_end -= 1;
|
||||
}
|
||||
|
||||
auto copy_entry = [&](const cache_t* __restrict__ _src,
|
||||
scalar_t* __restrict__ _dst) {
|
||||
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
|
||||
#pragma unroll
|
||||
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
_dst[i] = static_cast<scalar_t>(_src[i]);
|
||||
reinterpret_cast<stype*>(dst_)[idx] =
|
||||
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
|
||||
} else {
|
||||
_dst[i] =
|
||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
|
||||
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
|
||||
stype store_val;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < vec_size; ++j) {
|
||||
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
|
||||
loaded_val.val[j], *scale);
|
||||
}
|
||||
reinterpret_cast<stype*>(dst_)[idx] = store_val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (int pid = split_start; pid < full_blocks_end; ++pid) {
|
||||
auto block_id = batch_block_table[pid];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
}
|
||||
}
|
||||
|
||||
if (partial_block_size) {
|
||||
auto block_id = batch_block_table[full_blocks_end];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < partial_block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
// process tail
|
||||
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
|
||||
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
|
||||
src_ = src_ + ENTRY_SIZE - tail_cnt;
|
||||
#pragma unroll
|
||||
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
dst_[idx] = static_cast<scalar_t>(src_[idx]);
|
||||
} else {
|
||||
dst_[idx] =
|
||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -992,34 +991,38 @@ __global__ void gather_and_maybe_dequant_cache(
|
||||
// SCALAR_T is the data type of the destination tensor.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
block_size, entry_size, block_table_stride, cache_block_stride, \
|
||||
cache_entry_stride, dst_entry_stride, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
|
||||
thread_block_size> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
token_to_seq.data_ptr<int32_t>(), num_tokens, block_size, \
|
||||
block_table_stride, cache_block_stride, cache_entry_stride, \
|
||||
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
|
||||
seq_starts_ptr);
|
||||
|
||||
// Gather sequences from the cache into the destination tensor.
|
||||
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
||||
// - block_table contains the cache block indices for each sequence
|
||||
// - token_to_seq contains the back mapping from token_id to batch_id
|
||||
// - Optionally, seq_starts (if provided) offsets the starting block index by
|
||||
// (seq_starts[bid] / page_size)
|
||||
void gather_and_maybe_dequant_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
|
||||
int64_t num_tokens, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt) {
|
||||
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int32_t block_size = src_cache.size(1);
|
||||
int32_t entry_size = src_cache.flatten(2, -1).size(2);
|
||||
int32_t head_dim = dst.size(-1);
|
||||
|
||||
TORCH_CHECK(block_table.dtype() == torch::kInt32,
|
||||
"block_table must be int32");
|
||||
@@ -1029,6 +1032,9 @@ void gather_and_maybe_dequant_cache(
|
||||
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
||||
"seq_starts must be int32");
|
||||
}
|
||||
TORCH_CHECK(head_dim == 576,
|
||||
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
|
||||
"for better performance")
|
||||
|
||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||
"src_cache and dst must be on the same device");
|
||||
@@ -1046,10 +1052,9 @@ void gather_and_maybe_dequant_cache(
|
||||
int64_t cache_entry_stride = src_cache.stride(1);
|
||||
int64_t dst_entry_stride = dst.stride(0);
|
||||
|
||||
// Decide on the number of splits based on the batch size.
|
||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
||||
dim3 grid(batch_size, num_splits);
|
||||
dim3 block(1024);
|
||||
constexpr int32_t thread_block_size = 64;
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(thread_block_size);
|
||||
|
||||
const int32_t* seq_starts_ptr =
|
||||
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
||||
@@ -1058,6 +1063,82 @@ void gather_and_maybe_dequant_cache(
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Gather and upconvert FP8 KV cache tokens to BF16 workspace
|
||||
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
||||
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ seq_lens, // [BATCH]
|
||||
const int32_t* __restrict__ workspace_starts, // [BATCH]
|
||||
const int32_t block_size, const int32_t head_dim,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = workspace_starts[bid];
|
||||
const int32_t seq_len = seq_lens[bid];
|
||||
const int32_t tot_slots = seq_len;
|
||||
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
|
||||
|
||||
const int32_t split_start = split * split_slots;
|
||||
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
|
||||
|
||||
const bool is_active_split = (split_start < tot_slots);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = split_start;
|
||||
int32_t offset_div = offset / block_size;
|
||||
offset = offset % block_size;
|
||||
const int32_t* batch_block_table = block_table + batch_offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
// Process each token in this split
|
||||
for (int pid = split_start; pid < split_end; ++pid) {
|
||||
auto block_id = batch_block_table[offset_div];
|
||||
const uint8_t* token_ptr =
|
||||
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
|
||||
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
|
||||
|
||||
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
|
||||
const uint8_t* no_pe_ptr = token_ptr;
|
||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||
const __nv_bfloat16* rope_ptr =
|
||||
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
|
||||
|
||||
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
|
||||
if (tid < 512) {
|
||||
// FP8 dequantization
|
||||
const int tile = tid >> 7; // each tile is 128 elements
|
||||
const float scale = scales_ptr[tile];
|
||||
const uint8_t val = no_pe_ptr[tid];
|
||||
dst_ptr[tid] =
|
||||
fp8::scaled_convert<__nv_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
|
||||
} else if (tid < 576) {
|
||||
// Rope copy (64 bf16 elements)
|
||||
const int rope_idx = tid - 512;
|
||||
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
|
||||
}
|
||||
|
||||
// Move to next token
|
||||
offset += 1;
|
||||
if (offset == block_size) {
|
||||
offset_div += 1;
|
||||
offset = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
|
||||
// block_size.
|
||||
@@ -1199,6 +1280,57 @@ void cp_gather_cache(
|
||||
}
|
||||
}
|
||||
|
||||
// Host launcher for vllm::cp_gather_and_upconvert_fp8_kv_cache.
//
// Validates dtypes and device placement, then launches one thread block per
// (batch entry, split) pair with 576 threads each, on the current CUDA
// stream of src_cache's device.
//
// - src_cache must be uint8, dst must be bfloat16 with dst.size(1) == 576.
// - block_table, seq_lens and workspace_starts must be int32 and live on the
//   same device as src_cache.
// - batch_size is the number of batch entries to process; 0 is a no-op.
void cp_gather_and_upconvert_fp8_kv_cache(
    torch::Tensor const& src_cache,         // [NUM_BLOCKS, BLOCK_SIZE, 656]
    torch::Tensor const& dst,               // [TOT_TOKENS, 576]
    torch::Tensor const& block_table,       // [BATCH, BLOCK_INDICES]
    torch::Tensor const& seq_lens,          // [BATCH]
    torch::Tensor const& workspace_starts,  // [BATCH]
    int64_t batch_size) {
  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int32_t block_size = src_cache.size(1);
  int32_t head_dim = dst.size(1);

  TORCH_CHECK(block_table.dtype() == torch::kInt32,
              "block_table must be int32");
  TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32");
  TORCH_CHECK(workspace_starts.dtype() == torch::kInt32,
              "workspace_starts must be int32");

  TORCH_CHECK(src_cache.device() == dst.device(),
              "src_cache and dst must be on the same device");
  TORCH_CHECK(src_cache.device() == block_table.device(),
              "src_cache and block_table must be on the same device");
  TORCH_CHECK(src_cache.device() == seq_lens.device(),
              "src_cache and seq_lens must be on the same device");
  TORCH_CHECK(src_cache.device() == workspace_starts.device(),
              "src_cache and workspace_starts must be on the same device");

  TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
  TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
  TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
  TORCH_CHECK(batch_size >= 0, "batch_size must be non-negative");

  // An empty batch has nothing to gather. Returning here also avoids a
  // launch with a zero-sized grid dimension, which CUDA rejects as an
  // invalid configuration.
  if (batch_size == 0) return;

  int64_t block_table_stride = block_table.stride(0);
  int64_t cache_block_stride = src_cache.stride(0);
  int64_t cache_entry_stride = src_cache.stride(1);
  int64_t dst_entry_stride = dst.stride(0);

  // Decide on the number of splits based on the batch size: fewer splits per
  // sequence as the batch grows.
  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
  dim3 grid(batch_size, num_splits);
  // One thread per output element: 512 dequantized values + 64 rope values.
  dim3 block(576);

  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
      src_cache.data_ptr<uint8_t>(),
      reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
      block_table_stride, cache_block_stride, cache_entry_stride,
      dst_entry_stride);
}
|
||||
|
||||
// Macro to dispatch the kernel based on the data type.
|
||||
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user