Compare commits
1236 Commits
v0.11.1rc5...v0.13.0rc1
[Commit table: 1236 commits, listed newest first, from 7d80c73d42 to ab98f6556f; the Author, Date, and message columns of the table are not recoverable.]
.buildkite/ci_config.yaml (Normal file, +24 lines)
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:
+  - ".buildkite/test_areas"
+  - ".buildkite/image_build"
+run_all_patterns:
+  - "docker/Dockerfile"
+  - "CMakeLists.txt"
+  - "requirements/common.txt"
+  - "requirements/cuda.txt"
+  - "requirements/build.txt"
+  - "requirements/test.txt"
+  - "setup.py"
+  - "csrc/"
+  - "cmake/"
+run_all_exclude_patterns:
+  - "docker/Dockerfile."
+  - "csrc/cpu/"
+  - "csrc/rocm/"
+  - "cmake/hipify.py"
+  - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+  main: "vllm-ci-postmerge-repo"
+  premerge: "vllm-ci-test-repo"
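The `run_all_patterns` / `run_all_exclude_patterns` pair reads as a path filter over the files changed by a commit: touching a core build input triggers every test job unless the path falls under an excluded subtree. A minimal sketch of that selection rule, assuming plain prefix matching (the actual matcher in the CI tooling may differ):

```python
# Sketch of the run-all decision implied by ci_config.yaml above.
# ASSUMPTION: patterns act as path prefixes; the real CI matcher may differ.
RUN_ALL_PATTERNS = ["docker/Dockerfile", "CMakeLists.txt", "setup.py", "csrc/", "cmake/"]
RUN_ALL_EXCLUDE_PATTERNS = ["docker/Dockerfile.", "csrc/cpu/", "csrc/rocm/", "cmake/hipify.py"]

def should_run_all(changed_files: list[str]) -> bool:
    """Run every test job if any changed file hits a run-all pattern
    and is not carved out by an exclude pattern."""
    for path in changed_files:
        included = any(path.startswith(p) for p in RUN_ALL_PATTERNS)
        excluded = any(path.startswith(p) for p in RUN_ALL_EXCLUDE_PATTERNS)
        if included and not excluded:
            return True
    return False

assert should_run_all(["csrc/attention/attention_kernels.cu"])  # core CUDA source
assert not should_run_all(["csrc/cpu/cpu_types.hpp"])           # excluded subtree
assert not should_run_all(["docker/Dockerfile.cpu"])            # "docker/Dockerfile." exclusion
```

Under this reading, `docker/Dockerfile` itself still triggers everything, while the variant Dockerfiles (`docker/Dockerfile.cpu` and friends) are carved out by the `docker/Dockerfile.` exclusion.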
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """<!DOCTYPE html>
-<html>
-<body>
-<h1>Links for vLLM</h1/>
-<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
-</body>
-</html>
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
-    print(f"Generated index.html for {args.wheel}")
-    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
-    if "x86_64" in filename:
-        x86_wheel = filename
-        arm_wheel = filename.replace("x86_64", "aarch64").replace(
-            "manylinux1", "manylinux2014"
-        )
-    elif "aarch64" in filename:
-        x86_wheel = filename.replace("aarch64", "x86_64").replace(
-            "manylinux2014", "manylinux1"
-        )
-        arm_wheel = filename
-    else:
-        raise ValueError(f"Unsupported wheel: {filename}")
-    # cloudfront requires escaping the '+' character
-    f.write(
-        template.format(
-            x86_wheel=x86_wheel,
-            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
-            arm_wheel=arm_wheel,
-            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
-        )
-    )
.buildkite/image_build/image_build.sh (Executable file, +56 lines)
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+    exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+    echo "Image not found, proceeding with build..."
+else
+    echo "Image found"
+    exit 0
+fi
+
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+    merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+    merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+    --build-arg max_jobs=16 \
+    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+    --build-arg USE_SCCACHE=1 \
+    --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+    --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+    --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+    ${merge_base_commit_build_args} \
+    --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+    --cache-to type=registry,ref=${CACHE_TO},mode=max \
+    --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+    $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+    --push \
+    --target test \
+    --progress plain .
.buildkite/image_build/image_build.yaml (Normal file, +57 lines)
@@ -0,0 +1,57 @@
+group: Abuild
+steps:
+  - label: ":docker: Build image"
+    key: image-build
+    depends_on: []
+    commands:
+      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU image"
+    key: image-build-cpu
+    depends_on: []
+    commands:
+      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build HPU image"
+    soft_fail: true
+    depends_on: []
+    key: image-build-hpu
+    commands:
+      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU arm64 image"
+    key: cpu-arm64-image-build
+    depends_on: []
+    optional: true
+    commands:
+      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+        - exit_status: -10  # Agent was lost
+          limit: 2
.buildkite/image_build/image_build_cpu.sh (Executable file, +36 lines)
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <registry> <repo> <commit>"
+    exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+    echo "Image not found, proceeding with build..."
+else
+    echo "Image found"
+    exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+    --build-arg max_jobs=16 \
+    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+    --build-arg VLLM_CPU_AVX512BF16=true \
+    --build-arg VLLM_CPU_AVX512VNNI=true \
+    --build-arg VLLM_CPU_AMXBF16=true \
+    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+    --target vllm-test \
+    --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
.buildkite/image_build/image_build_cpu_arm64.sh (Executable file, +33 lines)
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <registry> <repo> <commit>"
+    exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+    echo "Image not found, proceeding with build..."
+else
+    echo "Image found"
+    exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+    --build-arg max_jobs=16 \
+    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+    --target vllm-test \
+    --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
.buildkite/image_build/image_build_hpu.sh (Executable file, +34 lines)
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+    echo "Usage: $0 <registry> <repo> <commit>"
+    exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+    echo "Image not found, proceeding with build..."
+else
+    echo "Image found"
+    exit 0
+fi
+
+# build
+docker build \
+    --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+    --build-arg max_jobs=16 \
+    --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+    --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+    --progress plain \
+    https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
@@ -8,3 +8,4 @@ tasks:
     value: 0.80
   limit: 250 # will run on 250 * 14 subjects = 3500 samples
   num_fewshot: 5
+  rtol: 0.05
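The `rtol` knob added here feeds `np.isclose` in `test_lm_eval_correctness.py` (see the hunks further down), replacing the previously hard-coded 0.08. As a quick illustration of what `rtol: 0.05` tolerates, with hypothetical scores and standard NumPy semantics:

```python
import numpy as np

# np.isclose(a, b, rtol=r) passes when |a - b| <= atol + r * |b|
# (atol defaults to 1e-08), so rtol: 0.05 allows roughly a 5% relative
# drift of the measured score against the recorded ground truth.
print(np.isclose(0.80, 0.77, rtol=0.05))  # True:  0.03 <= 0.05 * 0.77 (+ atol)
print(np.isclose(0.80, 0.75, rtol=0.05))  # False: 0.05 >  0.05 * 0.75 (+ atol)
```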
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
-model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.595
-  - name: "exact_match,flexible-extract"
-    value: 0.582
-limit: 1000
-num_fewshot: 5
.buildkite/lm-eval-harness/configs/models-large-rocm.txt (Normal file, +1 line)
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
     --tp-size=1
 """
 
+import os
+from contextlib import contextmanager
+
 import lm_eval
 import numpy as np
 import yaml
 
-RTOL = 0.08
+DEFAULT_RTOL = 0.08
+
+
+@contextmanager
+def scoped_env_vars(new_env: dict[str, str]):
+    if not new_env:
+        # Fast path: nothing to do
+        yield
+        return
+
+    old_values = {}
+    new_keys = []
+
+    try:
+        for key, value in new_env.items():
+            if key in os.environ:
+                old_values[key] = os.environ[key]
+            else:
+                new_keys.append(key)
+            os.environ[key] = str(value)
+        yield
+    finally:
+        # Restore / clean up
+        for key, value in old_values.items():
+            os.environ[key] = value
+        for key in new_keys:
+            os.environ.pop(key, None)
 
 
 def launch_lm_eval(eval_config, tp_size):
@@ -32,6 +61,9 @@ def launch_lm_eval(eval_config, tp_size):
         f"trust_remote_code={trust_remote_code},"
         f"max_model_len={max_model_len},"
     )
+
+    env_vars = eval_config.get("env_vars", None)
+    with scoped_env_vars(env_vars):
     results = lm_eval.simple_evaluate(
         model=backend,
         model_args=model_args,
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 
     results = launch_lm_eval(eval_config, tp_size)
 
+    rtol = eval_config.get("rtol", DEFAULT_RTOL)
+
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(
                 f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
+                f"ground_truth={ground_truth:.3f} | "
+                f"measured={measured_value:.3f} | rtol={rtol}"
             )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
 
     assert success
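Since `scoped_env_vars` is an ordinary context manager, its contract is easy to check standalone: pre-existing variables are restored and injected ones removed on exit. A small usage sketch, assuming the `scoped_env_vars` defined in the hunk above is in scope (`FOO` and `BAR` are made-up variable names, not vLLM settings):

```python
import os

# Illustrative use of the scoped_env_vars context manager added above.
os.environ["FOO"] = "original"

with scoped_env_vars({"FOO": "patched", "BAR": "1"}):
    assert os.environ["FOO"] == "patched"  # pre-existing value overridden
    assert os.environ["BAR"] == "1"        # new variable injected

assert os.environ["FOO"] == "original"     # original value restored on exit
assert "BAR" not in os.environ             # injected variable cleaned up
```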
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -34,6 +34,7 @@ Runtime environment variables:
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
 ### Latency test
 
@@ -107,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example:
+
+<details>
+<summary> An Example of default parameters field </summary>
+
+```json
+{
+    "defaults": {
+        "qps_list": [
+            "inf"
+        ],
+        "server_environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+        },
+        "server_parameters": {
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "block_size": 128,
+            "disable_log_stats": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "num_prompts": 200,
+            "ignore-eos": ""
+        }
+    },
+    "tests": [
+        {
+            "test_name": "serving_llama3B_tp2_random_128_128",
+            "server_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct",
+                "tensor_parallel_size": 2,
+            },
+            "client_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct",
+            }
+        },
+        {
+            "test_name": "serving_qwen3_tp4_random_128_128",
+            "server_parameters": {
+                "model": "Qwen/Qwen3-14B",
+                "tensor_parallel_size": 4,
+            },
+            "client_parameters": {
+                "model": "Qwen/Qwen3-14B",
+            }
+        },
+    ]
+}
+```
+
+</details>
+
 ### Visualizing the results
 
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
@@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
@@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
@@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
@@ -15,6 +15,8 @@ check_gpus() {
     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   elif command -v amd-smi; then
     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
   fi
 
   if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
+
+  declare -g arch_suffix=''
+
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
   fi
   echo "GPU type is $gpu_type"
 }
@@ -102,7 +110,8 @@ json2envs() {
 wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
-  timeout 1200 bash -c '
+  local timeout_val="1200"
+  timeout "$timeout_val" bash -c '
     until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
@@ -138,6 +147,10 @@ kill_gpu_processes() {
     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
     done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
   fi
 
   # remove vllm config file
@@ -304,12 +317,44 @@ run_throughput_tests() {
 run_serving_tests() {
   # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
 
   local serving_test_file
   serving_test_file=$1
 
   # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
+  jq -c '
+    if type == "array" then
+      # Plain format: test cases array
+      .[]
+    elif (type == "object" and has("tests")) then
+      # merge the default parameters into each test cases
+      . as $root
+      | ($root.defaults // {}) as $d
+      | ($root.tests // [])[]
+      # default qps / max_concurrency from defaults if missing
+      | .qps_list = (.qps_list // $d.qps_list)
+      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+      # merge envs / params: test overrides defaults
+      | .server_environment_variables =
+          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+      | .server_parameters =
+          (($d.server_parameters // {}) + (.server_parameters // {}))
+      | .client_parameters =
+          (($d.client_parameters // {}) + (.client_parameters // {}))
+    else
+      error("Unsupported serving test file format: must be array or object with .tests")
+    end
+  ' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
     if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -323,16 +368,21 @@ run_serving_tests() {
       continue
     fi
 
-    # get client and server arguments
+    # get client and server arguments (after merged the default parameters)
     server_params=$(echo "$params" | jq -r '.server_parameters')
     server_envs=$(echo "$params" | jq -r '.server_environment_variables')
     client_params=$(echo "$params" | jq -r '.client_parameters')
+
     server_args=$(json2args "$server_params")
     server_envs=$(json2envs "$server_envs")
     client_args=$(json2args "$client_params")
 
+    # qps_list
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
+    # max_concurrency_list (fallback to num_prompts if missing)
     max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
     if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
       num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
@@ -451,6 +501,7 @@ main() {
     ARCH='-cpu'
   else
     check_gpus
+    ARCH="$arch_suffix"
  fi
  check_hf_token
 
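The jq filter above performs a right-biased object merge: per-test values win over `defaults`, and `qps_list` / `max_concurrency_list` fall back to the defaults only when absent. The same semantics restated as a short Python reference sketch (field names come from the formats documented in the hunk; this is an illustration, not code from the PR):

```python
# Reference sketch of the defaults merge performed by the jq filter above.
# jq's object `+` is right-biased, i.e. {"a": 1} + {"a": 2} == {"a": 2}.
def merge_serving_test(defaults: dict, test: dict) -> dict:
    merged = dict(test)
    for key in ("qps_list", "max_concurrency_list"):
        merged.setdefault(key, defaults.get(key))  # fall back only when missing
    for key in ("server_environment_variables", "server_parameters", "client_parameters"):
        merged[key] = {**defaults.get(key, {}), **test.get(key, {})}  # test overrides defaults
    return merged

defaults = {"qps_list": ["inf"],
            "server_parameters": {"tensor_parallel_size": 1, "dtype": "bfloat16"}}
test = {"test_name": "serving_llama3B_tp2_random_128_128",
        "server_parameters": {"model": "meta-llama/Llama-3.2-3B-Instruct",
                              "tensor_parallel_size": 2}}

print(merge_serving_test(defaults, test))
```

Here `tensor_parallel_size` resolves to 2 from the test case while `dtype` comes from the defaults, mirroring jq's `+`, which keeps the right-hand value on key conflicts.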
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
@@ -1,610 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    }
-]
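Each deleted entry pins CPU-specific environment variables to the server launch. The JSON values are numbers, so any harness consuming entries like these has to stringify them before exporting; a minimal sketch (the serve command is a placeholder, not the harness's actual invocation):

    import os
    import subprocess

    def launch_server(test: dict) -> subprocess.Popen:
        env = os.environ.copy()
        # Values such as 100000 or 40 arrive as JSON numbers; process envs need strings.
        env.update({k: str(v) for k, v in test["server_environment_variables"].items()})
        cmd = ["vllm", "serve", test["server_parameters"]["model"]]  # placeholder command
        return subprocess.Popen(cmd, env=env)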
File diff suppressed because it is too large
@@ -1,8 +1,9 @@
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
+{
+    "defaults": {
+        "qps_list": [
+            "inf"
+        ],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
             "VLLM_RPC_TIMEOUT": 100000,
             "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -26,251 +27,220 @@
         "client_parameters": {
             "model": "meta-llama/Llama-3.1-8B-Instruct",
             "backend": "vllm",
+            "ignore-eos": "",
+            "num_prompts": 200
+        }
+    },
+    "tests": [
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "server_parameters": {
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
             "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
         }
     },
     {
         "test_name": "serving_llama8B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
         "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
+            "tensor_parallel_size": 2
         },
         "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
             "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 32
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
         }
     },
     {
         "test_name": "serving_llama8B_tp1_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
         "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
+            "tensor_parallel_size": 1
         },
         "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
             "dataset_name": "random",
             "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 32
+            "random-output-len": 128
         }
     },
     {
         "test_name": "serving_llama8B_tp2_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
         "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
+            "tensor_parallel_size": 2
        },
         "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
             "dataset_name": "random",
             "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 32
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_random_128_128",
+        "server_parameters": {
+            "tensor_parallel_size": 4
+        },
+        "client_parameters": {
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
         }
     },
     {
         "test_name": "serving_llama8B_tp1_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
         "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
+            "tensor_parallel_size": 1
         },
         "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
             "dataset_name": "random",
             "random-input-len": 128,
-            "random-output-len": 2048,
-            "ignore-eos": "",
-            "num_prompts": 32
+            "random-output-len": 2048
         }
     },
     {
         "test_name": "serving_llama8B_tp2_random_128_2048",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
         "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
+            "tensor_parallel_size": 2
         },
         "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
             "dataset_name": "random",
             "random-input-len": 128,
-            "random-output-len": 2048,
-            "ignore-eos": "",
-            "num_prompts": 32
+            "random-output-len": 2048
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_random_128_2048",
+        "server_parameters": {
+            "tensor_parallel_size": 4
+        },
+        "client_parameters": {
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 2048
         }
     },
     {
         "test_name": "serving_llama8B_tp1_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
         "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
+            "tensor_parallel_size": 1
         },
         "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
             "dataset_name": "random",
             "random-input-len": 2048,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 32
+            "random-output-len": 128
         }
     },
     {
         "test_name": "serving_llama8B_tp2_random_2048_128",
-        "qps_list": [1, 4, 16, "inf"],
-        "max_concurrency_list": [32],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
         "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
+            "tensor_parallel_size": 2
         },
         "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
             "dataset_name": "random",
             "random-input-len": 2048,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 32
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_random_2048_128",
+        "server_parameters": {
+            "tensor_parallel_size": 4
+        },
+        "client_parameters": {
+            "dataset_name": "random",
+            "random-input-len": 2048,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_llama3B_tp1_random_128_128",
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.2-3B-Instruct",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.2-3B-Instruct",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_granite2B_tp1_random_128_128",
+        "server_parameters": {
+            "model": "ibm-granite/granite-3.2-2b-instruct",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "ibm-granite/granite-3.2-2b-instruct",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_qwen1.7B_tp1_random_128_128",
+        "server_parameters": {
+            "model": "Qwen/Qwen3-1.7B",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen3-1.7B",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_qwen4B_tp1_random_128_128",
+        "server_parameters": {
+            "model": "Qwen/Qwen3-4B",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen3-4B",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_qwen8B_tp1_random_128_128",
+        "server_parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_glm9B_tp1_random_128_128",
+        "server_parameters": {
+            "model": "zai-org/glm-4-9b-hf",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "zai-org/glm-4-9b-hf",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_gemma7B_tp1_random_128_128",
+        "server_parameters": {
+            "model": "google/gemma-7b",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "google/gemma-7b",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
         }
     }
 ]
+}
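The refactor above hoists shared settings into a top-level "defaults" object, leaving each entry under "tests" with only its overrides. The merging step is not part of this diff; a plausible sketch, assuming the consumer merges each parameter section key by key:

    def expand_tests(config: dict) -> list[dict]:
        defaults = config["defaults"]
        expanded = []
        for test in config["tests"]:
            merged = {**defaults, **test}
            # Merge nested sections so {"tensor_parallel_size": 2} extends
            # the default server_parameters instead of replacing them.
            for section in ("server_parameters", "client_parameters"):
                merged[section] = {**defaults.get(section, {}), **test.get(section, {})}
            expanded.append(merged)
        return expanded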
@@ -0,0 +1,82 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 256,
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
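Note that qps_list values mix JSON numbers with the string "inf" (JSON has no infinity literal), so a consumer presumably converts the sentinel explicitly before use:

    def parse_qps(value) -> float:
        # "inf" denotes an unthrottled request rate.
        return float("inf") if value == "inf" else float(value)

    assert parse_qps("inf") == float("inf")
    assert parse_qps(4) == 4.0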
@@ -0,0 +1,61 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": ""
+        }
+    }
+]
@@ -8,7 +8,7 @@ steps:
   commands:
     # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"
@@ -30,19 +30,6 @@ steps:
     DOCKER_BUILDKIT: "1"

 # x86 + CUDA builds
-- label: "Build wheel - CUDA 12.8"
-  depends_on: ~
-  id: build-wheel-cuda-12-8
-  agents:
-    queue: cpu_queue_postmerge
-  commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-    - "mkdir artifacts"
-    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-    - "bash .buildkite/scripts/upload-wheels.sh"
-  env:
-    DOCKER_BUILDKIT: "1"
-
 - label: "Build wheel - CUDA 12.9"
   depends_on: ~
   id: build-wheel-cuda-12-9
@@ -109,31 +96,12 @@ steps:
 - label: "Annotate release workflow"
   depends_on:
     - create-multi-arch-manifest
-    - build-wheel-cuda-12-8
   id: annotate-release-workflow
   agents:
     queue: cpu_queue_postmerge
   commands:
     - "bash .buildkite/scripts/annotate-release.sh"

-- label: "Build and publish TPU release image"
-  depends_on: ~
-  if: build.env("NIGHTLY") == "1"
-  agents:
-    queue: tpu_queue_postmerge
-  commands:
-    - "yes | docker system prune -a"
-    - "git fetch --all"
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
-    - "docker push vllm/vllm-tpu:nightly"
-    - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-  plugins:
-    - docker-login#v3.0.0:
-        username: vllmbot
-        password-env: DOCKERHUB_TOKEN
-  env:
-    DOCKER_BUILDKIT: "1"
-
 - input: "Provide Release version here"
   id: input-release-version
   fields:
@@ -150,7 +118,7 @@ steps:
     queue: cpu_queue_postmerge
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
     - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
@@ -2,22 +2,29 @@

 set -ex

-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
-
-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
 fi

 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
+To download the wheel (by commit):
+\`\`\`
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+
+To download the wheel (by version):
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

 To download and upload the image:
@@ -38,8 +45,9 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
.buildkite/scripts/generate-nightly-index.py (new file, 389 lines)
@@ -0,0 +1,389 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import sys
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+import regex as re
+
+if not sys.version_info >= (3, 12):
+    raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
+<html>
+<!-- {comment} -->
+<meta name="pypi:repository-version" content="1.0">
+<body>
+{items}
+</body>
+</html>
+"""
+
+
+@dataclass
+class WheelFileInfo:
+    package_name: str
+    version: str
+    build_tag: str | None
+    python_tag: str
+    abi_tag: str
+    platform_tag: str
+    variant: str | None
+    filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+    """
+    Parse wheel file name to extract metadata.
+
+    The format of wheel names:
+    {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
+    Example:
+    vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+    vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+    vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+    vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+    """
+    wheel_file_re = re.compile(
+        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+    )
+    match = wheel_file_re.match(file)
+    if not match:
+        raise ValueError(f"Invalid wheel file name: {file}")
+
+    package_name = match.group("package_name")
+    version = match.group("version")
+    build_tag = match.group("build_tag")
+    python_tag = match.group("python_tag")
+    abi_tag = match.group("abi_tag")
+    platform_tag = match.group("platform_tag")
+
+    # extract variant from version
+    variant = None
+    if "dev" in version:
+        ver_after_dev = version.split("dev")[-1]
+        if "." in ver_after_dev:
+            variant = ver_after_dev.split(".")[-1]
+            version = version.removesuffix("." + variant)
+    else:
+        if "+" in version:
+            version, variant = version.split("+")
+
+    return WheelFileInfo(
+        package_name=package_name,
+        version=version,
+        build_tag=build_tag,
+        python_tag=python_tag,
+        abi_tag=abi_tag,
+        platform_tag=platform_tag,
+        variant=variant,
+        filename=file,
+    )
+
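A quick sanity check of parse_from_filename against the documented examples; the expected values follow from the regex and the variant-stripping branches above:

    info = parse_from_filename("vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl")
    assert info.package_name == "vllm"
    assert info.version == "0.10.2rc2"
    assert info.variant == "cu129"
    assert info.platform_tag == "manylinux2014_aarch64"

    # Dev wheels carry the variant after the local-version segment instead:
    info = parse_from_filename("vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl")
    assert info.version == "0.11.1rc8.dev14+gaa384b3c0"
    assert info.variant == "cu130"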
|
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||||
|
"""
|
||||||
|
Generate project list HTML content linking to each project & variant sub-directory.
|
||||||
|
"""
|
||||||
|
href_tags = []
|
||||||
|
for name in sorted(subdir_names):
|
||||||
|
name = name.strip("/").strip(".")
|
||||||
|
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
|
||||||
|
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo],
    wheel_base_dir: Path,
    index_base_dir: Path,
    comment: str = "",
) -> tuple[str, str]:
    """
    Generate the package index HTML content for a specific package, linking to the actual wheel files.
    """
    href_tags = []
    metadata = []
    for file in sorted(wheel_files, key=lambda x: x.filename):
        relative_path = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
        )
        # handle '+' in the URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
        href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
        file_meta = asdict(file)
        file_meta["path"] = file_path_quoted
        metadata.append(file_meta)
    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
    metadata_str = json.dumps(metadata, indent=2)
    return index_str, metadata_str
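
# For illustration, the safe=":%/" argument of the stdlib urllib.parse.quote keeps
# path separators and existing percent-escapes intact while still encoding '+':
#
#   quote("../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", safe=":%/")
#   # -> '../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl'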

def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
    comment: str = "",
):
    """
    Generate the index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.
        comment (str): Optional comment to include in the generated HTML files.

    First, parse all wheel files to extract metadata.
    We collect the wheel files for each variant and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created; it has the same content
    as the default variant index, but with the links adjusted accordingly.

    Index directory structure:
        index_base_dir/      (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html       # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html     # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json  # machine-readable metadata for all wheels in this package
            cpu/             # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/           # cu129 is actually the alias to the default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/           # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # already URL-encoded; to be concatenated with the directory URL
            },
            ...
        ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant = file.variant or "default"
        if variant not in variant_to_files:
            variant_to_files[variant] = []
        variant_to_files[variant].append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate comment in HTML header
    comment_str = f" ({comment})" if comment else ""
    comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"

    # Generate index for each variant
    subdir_names = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant
        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list",
            # generated after all variants are processed
            subdir_names = subdir_names.union(packages)
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages), comment_tmpl)
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)

        for package in packages:
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir, comment
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)

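# For context: consumers resolve wheels through these generated indices via pip's
# extra-index-url mechanism; an illustrative (not authoritative) invocation:
#
#   pip install vllm --extra-index-url https://wheels.vllm.ai/nightly/cu130/
#
# where "cu130" is one of the variant sub-directories described above.
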
if __name__ == "__main__":
    """
    Arguments:
        --version <version>                     : version string for the current build (e.g., commit hash)
        --current-objects <path_to_json>        : path to a JSON file containing the current S3 objects listing in this version directory
        --output-dir <output_directory>         : directory to store the generated index files
        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
        --comment <comment_string>              : (optional) comment string to include in the generated HTML files
    """

    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    parser.add_argument(
        "--version",
        type=str,
        required=True,
        help="Version string for the current build (e.g., commit hash)",
    )
    parser.add_argument(
        "--current-objects",
        type=str,
        required=True,
        help="Path to JSON file containing current S3 objects listing in this version directory",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to store generated index files",
    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )
    parser.add_argument(
        "--comment",
        type=str,
        default="",
        help="Optional comment string to include in generated HTML files",
    )

    args = parser.parse_args()

    version = args.version
    if "/" in version or "\\" in version:
        raise ValueError("Version string must not contain slashes.")
    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    # Read the current objects JSON
    with open(current_objects_path) as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)

    # current_objects looks like the output of the list_objects_v2 S3 API:
    """
    "Contents": [
        {
            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
            "LastModified": "2025-11-28T14:00:32+00:00",
            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
            "ChecksumAlgorithm": [
                "CRC64NVME"
            ],
            "ChecksumType": "FULL_OBJECT",
            "Size": 435649349,
            "StorageClass": "STANDARD"
        },
        ...
    ]
    """

    # Extract wheel file keys
    wheel_files = []
    for item in current_objects.get("Contents", []):
        key: str = item["Key"]
        if key.endswith(".whl"):
            wheel_files.append(key.split("/")[-1])  # only the filename is used

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # Generate index and metadata, assuming wheels and indices are stored as:
    #   s3://vllm-wheels/{version}/<wheel files>
    #   s3://vllm-wheels/<anything>/<index files>
    wheel_base_dir = Path(output_dir).parent / version
    index_base_dir = Path(output_dir)

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
        comment=args.comment.strip(),
    )
    print(f"Successfully generated index and metadata in {output_dir}")
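
# Example CI invocation (hypothetical values; the real call site is the wheel
# upload script shown further below):
#
#   python3 generate-nightly-index.py --version <commit-sha> \
#       --current-objects objects.json --output-dir indices \
#       --alias-to-default cu129 --comment "commit <commit-sha>"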
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 
-if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
-commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
 commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 
-if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
-commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
-fi
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
 
 if [[ $commands == *"pytest -v -s lora"* ]]; then
 commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
@@ -173,19 +169,28 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+echo "Error: 'render' group not found. This is required for GPU access." >&2
+exit 1
+fi
+
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
 # assign job count as the number of shards used
-commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
 for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
 # assign shard-id for each shard
-commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
 echo "Shard ${GPU} commands:$commands_gpu"
 echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
 docker run \
 --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
 --network=host \
 --shm-size=16gb \
+--group-add "$render_gid" \
 --rm \
 -e HIP_VISIBLE_DEVICES="${GPU}" \
 -e HF_TOKEN \
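
For illustration, the sed-based rewrite above (unlike the old `${commands//"--shard-id= "/...}` substitution, which only matched one exact literal) tolerates whitespace around `=` and overwrites any pre-set value; a minimal sketch with hypothetical input:

    commands='pytest -v -s lora --shard-id= --num-shards='
    PARALLEL_JOB_COUNT=8
    GPU=3
    commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g")
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g")
    echo "$commands_gpu"   # -> pytest -v -s lora --shard-id=3  --num-shards=8  (extra spaces are harmless)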
@@ -217,8 +222,8 @@ else
 --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
 --network=host \
 --shm-size=16gb \
+--group-add "$render_gid" \
 --rm \
--e HIP_VISIBLE_DEVICES=0 \
 -e HF_TOKEN \
 -e AWS_ACCESS_KEY_ID \
 -e AWS_SECRET_ACCESS_KEY \

.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh  (new executable file, 63 lines)
@@ -0,0 +1,63 @@
#!/bin/bash

# This script builds the CPU docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# allow binding to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}

export CMAKE_BUILD_PARALLEL_LEVEL=16

# Setup cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .

# Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test

function cpu_tests() {
  set -e

  docker exec cpu-test bash -c "
    set -e
    pip list"

  # offline inference
  docker exec cpu-test bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run kernel tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
    pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"

  # basic online serving
  docker exec cpu-test bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model Qwen/Qwen3-0.6B \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
}

# All of the CPU tests are expected to finish in less than 40 mins.
export -f cpu_tests
timeout 2h bash -c cpu_tests
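
A side note on the closing `export -f` / `timeout` pairing used by this and the other CPU CI scripts: exporting the function makes it visible to the `bash -c` child, so the whole suite can be bounded by a single timeout. A minimal sketch, assuming plain bash:

    greet() { echo "hello from $1"; }
    export -f greet
    timeout 10 bash -c 'greet subshell'   # the child shell sees the exported function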
@@ -25,20 +25,22 @@ function cpu_tests() {
 
 # offline inference
 podman exec -it "$container_id" bash -c "
+export TORCH_COMPILE_DISABLE=1
 set -xve
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
 # Run basic model test
 podman exec -it "$container_id" bash -c "
+export TORCH_COMPILE_DISABLE=1
 set -evx
 pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-pip install sentence-transformers datamodel_code_generator
+pip install sentence-transformers datamodel_code_generator tblib
 
 # Note: disable Bart until supports V1
 # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
+pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
-pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
+pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
-pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
 pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
 # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
 # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -49,6 +49,7 @@ function cpu_tests() {
 # Run kernel tests
 docker exec cpu-test-"$NUMA_NODE" bash -c "
 set -e
+pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
 pytest -x -v -s tests/kernels/test_onednn.py"
 
 # Run basic model test
@@ -72,12 +73,11 @@ function cpu_tests() {
 pytest -x -s -v \
 tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
 
-# Note: disable it until supports V1
-# Run AWQ test
-# docker exec cpu-test-"$NUMA_NODE" bash -c "
-# set -e
-# VLLM_USE_V1=0 pytest -x -s -v \
-# tests/quantization/test_ipex_quant.py"
+# Run AWQ/GPTQ test
+docker exec cpu-test-"$NUMA_NODE" bash -c "
+set -e
+pytest -x -s -v \
+tests/quantization/test_cpu_wna16.py"
 
 # Run multi-lora tests
 docker exec cpu-test-"$NUMA_NODE" bash -c "
@@ -116,4 +116,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive
+ENV SOC_VERSION="ascend910b1"
 
 RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
 pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
@@ -35,9 +35,10 @@ docker run \
 echo $ZE_AFFINITY_MASK
 pip install tblib==3.1.0
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
 VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 cd tests
 pytest -v -s v1/core
@@ -46,6 +47,6 @@ docker run \
 pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
 pytest -v -s v1/structured_output
 pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
-pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
 pytest -v -s v1/test_serial_utils.py
 '
@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
 
+if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
+echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
+exit 0
+fi
+
 echo "Setting up Prime-RL integration test environment..."
 
 # Clean up any existing Prime-RL directory
@@ -0,0 +1,73 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
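
All four of these scheduled EPLB scripts gate on the eval result with the same inline-Python heredoc; stripped down (hypothetical file name and threshold), the pattern is:

OUT=/tmp/results.json
python3 - <<PY
import json
acc = json.load(open("${OUT}"))["accuracy"]
print(f"accuracy {acc:.3f}")
assert acc >= 0.25, f"accuracy {acc} below threshold"
PY

The shell expands `${OUT}` inside the unquoted heredoc before Python runs, and a failed assert exits non-zero, which fails the CI step under `set -e`.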
@@ -0,0 +1,73 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="QWen/Qwen3-30B-A3B-FP8"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -2,6 +2,28 @@
 
 set -ex
 
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129"  # align with VLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3}  # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+# use new python from docker
+docker pull python:3-slim
+PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
 # Assume wheels are in artifacts/dist/*.whl
 wheel_files=(artifacts/dist/*.whl)
 
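
The interpreter probe above prints a bare 0/1 so the shell can branch without parsing version strings, and when it reports an old interpreter, every later `$PYTHON script.py` call expands into a containerized run by relying on word splitting of the unquoted variable; schematically (image tag as in the diff):

    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
    $PYTHON --version   # runs python3 --version inside the python:3-slim container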
@@ -10,74 +32,72 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
 echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
 exit 1
 fi
 
-# Get the single wheel file
 wheel="${wheel_files[0]}"
 
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
-manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
-manylinux_version="manylinux2014"
-else
-echo "Warning: Unknown architecture $arch, using manylinux1 as default"
-manylinux_version="manylinux1"
-fi
+# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="manylinux_2_31"
 
 # Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+echo "Error: Wheel filename does not contain 'linux': $wheel"
+exit 1
+fi
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
 
 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
 
-normal_wheel="$wheel" # Save the original wheel filename
+# copy wheel to its own bucket
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
 
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-suffix="${version##*.}"
-if [[ $suffix == cu* ]]; then
-new_version="1.0.0.dev+${suffix}"
-else
-new_version="1.0.0.dev"
-fi
-new_wheel="${wheel/$version/$new_version}"
-# use cp to keep both files in the artifacts directory
-cp -- "$wheel" "$new_wheel"
-wheel="$new_wheel"
-version="$new_version"
-fi
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# this script might be run multiple times if there are multiple variants being built,
+# so we need to guarantee there is little chance of "TOCTOU" issues
+# (i.e., one process generating indices while another is uploading a new wheel),
+# so we need to ensure no time-consuming operations happen below
 
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
 
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+# call the script to generate indices for all existing wheels
+# these indices use relative paths that work as long as they sit next to the wheel directory in S3,
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and the indices can be placed in /<commit>/, /nightly/, or /<version>/
-if [[ $normal_wheel == *"cu129"* ]]; then
-# only upload index.html for cu129 wheels (default wheels) as it
-# is available on both x86 and arm64
-aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
-aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
 else
-echo "Skipping index files for non-cu129 wheels"
+alias_arg=""
 fi
 
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+# HACK: we do not need the regex module here, but it is required by a pre-commit hook;
+# to avoid any external dependency, we simply replace it back with the stdlib re module
+sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
 
-if [[ $normal_wheel == *"cu129"* ]]; then
-# only upload index.html for cu129 wheels (default wheels) as it
-# is available on both x86 and arm64
-aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
-echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if this is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+echo "Uploading indices to overwrite /nightly/"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
 fi
-fi
 
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# copy to /<pure_version>/ only if the version does not contain "dev"
+if [[ "$version" != *"dev"* ]]; then
+echo "Uploading indices to overwrite /$pure_version/"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
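
Net effect of the rewritten upload flow, sketched as the resulting bucket layout (names illustrative; <commit> is the full SHA):

    s3://vllm-wheels/<commit>/vllm-*.whl          # one wheel per variant/arch
    s3://vllm-wheels/<commit>/index.html          # project list
    s3://vllm-wheels/<commit>/vllm/index.html     # package index with relative links
    s3://vllm-wheels/<commit>/vllm/metadata.json
    s3://vllm-wheels/<commit>/<variant>/...       # per-variant sub-indices, e.g. cu129/
    s3://vllm-wheels/nightly/...                  # same indices; main-branch builds only
    s3://vllm-wheels/<pure_version>/...           # same indices; non-dev versions only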
@@ -39,20 +39,20 @@ steps:
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
   # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   soft_fail: true
   source_file_dependencies:
   - requirements/nightly_torch_test.txt
   commands:
   - bash standalone_tests/pytorch_nightly_dependency.sh
 
-- label: Async Engine, Inputs, Utils, Worker Test # 36min
-  timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/multimodal
@@ -61,25 +61,29 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_
 
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental, amdproduction]
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
   - tests/transformers_utils
+  - tests/config
   no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
   - pytest -v -s transformers_utils
+  - pytest -v -s config
 
 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -111,9 +115,9 @@ steps:
   - pytest -v -s basic_correctness/test_cpu_offload.py
 
 - label: Entrypoints Unit Tests # 5min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   timeout_in_minutes: 10
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -187,7 +191,7 @@ steps:
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
@@ -210,12 +214,13 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
   - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -226,10 +231,31 @@ steps:
|
|||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||||
- popd
|
- popd
|
||||||
|
|
||||||
- label: EPLB Algorithm Test # 5min
|
- label: Distributed Tests (8 GPUs) # 4min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
timeout_in_minutes: 10
|
||||||
agent_pool: mi325_1
|
mirror_hardwares: [amdexperimental]
|
||||||
|
agent_pool: mi325_8
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
|
gpu: h100
|
||||||
|
num_gpus: 8
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
source_file_dependencies:
|
||||||
|
- examples/offline_inference/torchrun_dp_example.py
|
||||||
|
- vllm/config/parallel.py
|
||||||
|
- vllm/distributed/
|
||||||
|
- vllm/v1/engine/llm_engine.py
|
||||||
|
- vllm/v1/executor/uniproc_executor.py
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
commands:
|
||||||
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
|
#- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
|
# test with torchrun tp=2 and dp=4 with ep
|
||||||
|
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||||
|
|
||||||
|
- label: EPLB Algorithm Test # 5min
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||||
|
agent_pool: mi325_1
|
||||||
|
grade: Blocking
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
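A note on the new `Distributed Tests (8 GPUs)` step above: with torchrun, the launched process count has to equal the product of the parallel sizes, here 2 (TP) × 1 (PP) × 4 (DP) = 8, which is why `--nproc-per-node=8` matches `num_gpus: 8`. A minimal sketch of that invariant using only keys that appear in this diff (the label is illustrative, not part of the change):

```yaml
# Sketch only: torchrun workers and vLLM's parallel groups must agree,
# i.e. num_gpus == tp_size * pp_size * dp_size (2 * 1 * 4 = 8 here).
- label: Torchrun DP/TP/EP sanity sketch # hypothetical label, for illustration
  gpu: h100
  num_gpus: 8
  commands:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
```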
@@ -238,11 +264,11 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py

-- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_4
   # grade: Blocking
-  timeout_in_minutes: 15
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -250,6 +276,7 @@ steps:
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py

 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -273,7 +300,7 @@ steps:

 - label: Regression Test # 7min
   timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
   grade: Blocking
   source_file_dependencies:
@@ -284,23 +311,20 @@ steps:
   - pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional

-- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  #grade: Blocking
+  # grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/engine
-  - tests/tokenization
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
   - tests/test_vllm_port
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
   timeout_in_minutes: 45
@@ -318,9 +342,9 @@ steps:

 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   agent_pool: mi325_1
-  # grade: Blocking
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/v1
@@ -337,6 +361,7 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
@@ -348,14 +373,53 @@ steps:
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_outputs.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

-- label: V1 Test others (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental, amdproduction]
+# TODO: Add the "V1 Test attetion (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+  mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - pytest -v -s v1/attention
+
+- label: Batch Invariance Tests (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - vllm/model_executor/layers
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
+- label: V1 Test others (CPU) # 5 mins
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
   source_file_dependencies:
   - vllm/
   - tests/v1
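The three attention groups added above share a pattern worth calling out: the same test directory is fanned out across hardware targets via the `gpu:` selector, with per-target environment flags in `commands`. A reduced sketch, assuming only keys visible in this diff (the label is illustrative):

```yaml
- label: Example hardware-pinned attention step # illustrative only
  gpu: b200                  # schedule on B200 agents; h100 pins to Hopper
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/v1/attention        # run only when these prefixes change
  - tests/v1/attention
  commands:
  # per-hardware env flags gate backend paths, as the B200 step above does
  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention
```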
@@ -377,23 +441,29 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
+  # for basic
+  - python3 offline_inference/basic/chat.py
   - python3 offline_inference/basic/generate.py --model facebook/opt-125m
   - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-  - python3 offline_inference/basic/chat.py
-  - python3 offline_inference/prefix_caching.py
-  - python3 offline_inference/llm_engine_example.py
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
+  # for multi-modal models
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
   - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-  - python3 offline_inference/basic/classify.py
-  - python3 offline_inference/basic/embed.py
-  - python3 offline_inference/basic/score.py
+  # for pooling models
+  - python3 pooling/pooling/vision_language_pooling.py --seed 0
+  # for features demo
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
   # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
   - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -441,47 +511,12 @@ steps:
     --ignore=lora/test_llm_with_multi_loras.py \
     --ignore=lora/test_olmoe_tp.py \
     --ignore=lora/test_deepseekv2_tp.py \
-    --ignore=lora/test_gptoss.py \
+    --ignore=lora/test_gptoss_tp.py \
     --ignore=lora/test_qwen3moe_tp.py
   parallelism: 4

 - label: PyTorch Compilation Unit Tests # 15min
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_1
-  # grade: Blocking
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/compile
-  commands:
-  - pytest -v -s compile/test_pass_manager.py
-  - pytest -v -s compile/test_fusion.py
-  - pytest -v -s compile/test_fusion_attn.py
-  - pytest -v -s compile/test_functionalization.py
-  - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  # - pytest -v -s compile/test_sequence_parallelism.py
-  # - pytest -v -s compile/test_async_tp.py
-  - pytest -v -s compile/test_fusion_all_reduce.py
-  - pytest -v -s compile/test_decorator.py
-  - pytest -v -s compile/test_noop_elimination.py
-  - pytest -v -s compile/test_aot_compile.py
-
-- label: PyTorch Fullgraph Smoke Test # 15min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_1
-  # grade: Blocking
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/compile
-  commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/piecewise/
-
-- label: PyTorch Fullgraph Test # 22min
-  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -490,8 +525,56 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"

+- label: PyTorch Fullgraph Smoke Test # 15min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  # Limit to no custom ops to reduce running time
+  # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
+
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
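The `find ... -exec pytest` idiom introduced in these compile steps runs one pytest process per matching file instead of a single shared session, sidestepping the cross-file interference tracked in vllm-project/vllm#28965. A sketch of what the quoted command expands to (the enumerated files are examples taken from this diff, not an exhaustive list):

```yaml
commands:
# One short-lived pytest process per file: leaked state or a crash in one
# file cannot poison collection or fixtures in the next.
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
# Roughly equivalent to running each file by hand:
#   pytest -s -v compile/test_decorator.py
#   pytest -s -v compile/test_noop_elimination.py
```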
@@ -507,7 +590,7 @@ steps:

 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
   # grade: Blocking
   source_file_dependencies:
@@ -534,7 +617,7 @@ steps:

 - label: Kernels MoE Test %N # 40min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_8
   # grade: Blocking
   source_file_dependencies:
@@ -543,6 +626,8 @@ steps:
   - tests/kernels/moe
   - vllm/model_executor/layers/fused_moe/
   - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
   commands:
   - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
@@ -559,12 +644,35 @@ steps:
   commands:
   - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+  # Not replicating for CUTLAS & CuTe
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+  - pytest -v -s kernels/moe/test_deepgemm.py
+  - pytest -v -s kernels/moe/test_batched_deepgemm.py
+  - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
+  torch_nightly: true
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
   - vllm/model_executor
   - tests/model_executor
   - tests/entrypoints/openai/test_tensorizer_entrypoint.py
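The new DeepGEMM group also illustrates how tightly a step can be scoped: each test it runs is itself listed under `source_file_dependencies`, so the step fires on changes to either the kernel wrappers or the tests. A reduced sketch of that shape (trimmed from the step above, not a separate step):

```yaml
- label: Kernels DeepGEMM Test (H100) # reduced sketch of the step above
  gpu: h100
  num_gpus: 1
  source_file_dependencies:
  - vllm/utils/deep_gemm.py                # the wrapper under test
  - tests/kernels/moe/test_deepgemm.py     # the test file itself
  commands:
  - pytest -v -s kernels/moe/test_deepgemm.py
```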
@@ -614,6 +722,7 @@ steps:
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
   - uv pip install --system torchao==0.13.0
+  - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: LM Eval Small Models # 53min
@@ -624,12 +733,13 @@ steps:
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
+  autorun_on_main: true
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

-- label: OpenAI API correctness # 22min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+- label: OpenAI API correctness # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -637,6 +747,7 @@ steps:
   - vllm/entrypoints/openai/
   - vllm/model_executor/models/whisper.py
   commands: # LMEval+Transcription WER check
+  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
   - pytest -s entrypoints/openai/correctness/

 - label: OpenAI-Compatible Tool Use # 23 min
@@ -686,6 +797,7 @@ steps:
   torch_nightly: true
   source_file_dependencies:
   - vllm/model_executor/models/
+  - vllm/transformers_utils/
   - tests/models/test_initialization.py
   commands:
   # Only when vLLM model source is modified - test initialization of a large
@@ -831,6 +943,18 @@ steps:
   commands:
   - pytest -v -s models/language/pooling_mteb_test

+- label: Multi-Modal Processor Test (CPU)
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
 - label: Multi-Modal Processor Test # 44min
   timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
@@ -858,10 +982,11 @@ steps:
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

-- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  mirror_hardwares: [amdexperimental]
+- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  timeout_in_minutes: 70
+  # grade: Blocking
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - vllm/multimodal/
@@ -870,7 +995,8 @@ steps:
   commands:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
+  timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -894,7 +1020,8 @@ steps:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

-- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
+  timeout_in_minutes: 150
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking
@@ -932,16 +1059,17 @@ steps:
 - label: Transformers Nightly Models Test
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
+  # grade: Blocking
   working_dir: "/vllm-workspace/"
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
-  - pytest -v -s tests/models/multimodal/processing/
+  # - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py
+  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -959,11 +1087,16 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
   # Attention
   # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+  - pytest -v -s tests/kernels/attention/test_attention_selector.py
   - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
   - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
   - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -980,8 +1113,9 @@ steps:
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -989,18 +1123,50 @@ steps:
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
+  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+  # Wrap with quotes to escape yaml
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
   - vllm/compilation/
   # can affect pattern matching
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
   commands:
   - nvidia-smi
-  - pytest -v -s tests/compile/test_fusion_attn.py
-  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-  # this runner has 2 GPUs available even though num_gpus=2 is not set
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusions_e2e.py
+  # Run all e2e fusion tests
+  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
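The Blackwell split above separates the fast fusion and compile unit tests, which stay on the per-PR path, from the heavier two-GPU e2e fusion suite, which becomes `optional: true`. A minimal sketch of that optional multi-GPU shape, using only keys that appear in this diff (the label is illustrative):

```yaml
- label: Example optional multi-GPU step # illustrative only
  gpu: b200
  num_gpus: 2      # request a 2-GPU agent explicitly
  optional: true   # skipped on normal PR runs; triggered manually or on nightlies
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
```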
@@ -1104,7 +1270,7 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/v1/engine/
   - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
@@ -1114,10 +1280,11 @@ steps:
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1149,7 +1316,7 @@ steps:

 - label: Plugin Tests (2 GPUs) # 40min
   timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -1218,9 +1385,14 @@ steps:
   - pytest -v -s -x lora/test_llm_with_multi_loras.py
   - pytest -v -s -x lora/test_olmoe_tp.py

+  # Disabled for now because MXFP4 backend on non-cuda platform
+  # doesn't support LoRA yet
+  #- pytest -v -s -x lora/test_gptoss_tp.py
+
+
 - label: Weight Loading Multiple GPU Test # 33min
   timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
@@ -1230,7 +1402,7 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
   mirror_hardwares: [amdexperimental]
@@ -1238,17 +1410,17 @@ steps:
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  gpu: a100
   optional: true
   source_file_dependencies:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_4
+  # grade: Blocking
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -1263,6 +1435,9 @@ steps:
 ##### A100 test #####

 - label: Distributed Tests (A100) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
   gpu: a100
   optional: true
   num_gpus: 4
@@ -1276,7 +1451,86 @@ steps:
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py

+
 - label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+  - pytest -v -s tests/v1/distributed/test_dbo.py
+
+##### E2E Eval Tests #####
+- label: LM Eval Small Models (1 Card) # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: LM Eval Large Models (4 Card)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
   gpu: a100
   optional: true
   num_gpus: 4
@@ -1288,29 +1542,29 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

-##### H200 test #####
-- label: Distributed Tests (H200) # optional
-  gpu: h200
-  optional: true
-  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+- label: ROCm LM Eval Large Models (8 Card)
+  mirror_hardwares: [amdproduction]
+  agent_pool: mi325_8
+  num_gpus: 8
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   commands:
-  - pytest -v -s tests/compile/test_async_tp.py
-  - pytest -v -s tests/compile/test_sequence_parallelism.py
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-  - pytest -v -s tests/distributed/test_context_parallel.py
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8

-##### B200 test #####
-- label: Distributed Tests (B200) # optional
-  gpu: b200
-  optional: true
+- label: ROCm GPT-OSS Eval
+  timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  agent_pool: mi325_1
+  mirror_hardwares: [amdexperimental, amdproduction]
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
   commands:
-  - pytest -v -s tests/distributed/test_context_parallel.py
-  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+  - uv pip install --system 'gpt-oss[eval]==0.0.5'
+  - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@@ -1326,3 +1580,59 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

@@ -25,6 +25,7 @@
 # and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
 # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
 # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch.

 # When adding a test
 # - If the test belongs to an existing group, add it there
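The newly documented `autorun_on_main` key reads naturally alongside `source_file_dependencies`: a step can stay path-gated on PRs yet always run on main. A hedged sketch of a step opting in (the label is illustrative; the command is the small-model eval already used elsewhere in this pipeline):

```yaml
- label: Example autorun step # illustrative only
  autorun_on_main: true   # runs automatically on every push to main
  source_file_dependencies:
  - csrc/                 # on PRs, still gated by these prefixes
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
```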
@@ -56,22 +57,26 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+  timeout_in_minutes: 20
   source_file_dependencies:
   - vllm/
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
   - tests/transformers_utils
+  - tests/config
   no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
   - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -164,7 +169,7 @@ steps:
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
@@ -189,12 +194,13 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
   - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -232,8 +238,8 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py

-- label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Execution Test # 10min
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -241,6 +247,7 @@ steps:
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py

 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -271,21 +278,18 @@ steps:
   - pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional

-- label: Engine Test # 25min
-  timeout_in_minutes: 40
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/engine
-  - tests/tokenization
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
   - tests/test_vllm_port
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization

 - label: V1 Test e2e + engine # 30min
   timeout_in_minutes: 45
@@ -315,6 +319,7 @@ steps:
   - vllm/
   - tests/v1
   commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   # split the test to avoid interference
   - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
@@ -327,6 +332,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_outputs.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -340,6 +346,28 @@ steps:
   commands:
   - pytest -v -s v1/attention

+- label: Batch Invariance Tests (H100) # 10min
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - vllm/model_executor/layers
+  - tests/v1/determinism/
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pip install pytest-timeout pytest-forked
+  - pytest -v -s v1/determinism/test_batch_invariance.py
+  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
 - label: V1 Test others (CPU) # 5 mins
   source_file_dependencies:
   - vllm/
@@ -360,23 +388,28 @@ steps:
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
   - vllm/entrypoints
+  - vllm/multimodal
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
+  # for basic
+  - python3 offline_inference/basic/chat.py
   - python3 offline_inference/basic/generate.py --model facebook/opt-125m
   - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-  - python3 offline_inference/basic/chat.py
-  - python3 offline_inference/prefix_caching.py
-  - python3 offline_inference/llm_engine_example.py
-  - python3 offline_inference/audio_language.py --seed 0
-  - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_pooling.py --seed 0
-  - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 offline_inference/basic/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 offline_inference/basic/score.py
|
||||||
|
# for multi-modal models
|
||||||
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
|
# for pooling models
|
||||||
|
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||||
|
# for features demo
|
||||||
|
- python3 offline_inference/prefix_caching.py
|
||||||
|
- python3 offline_inference/llm_engine_example.py
|
||||||
|
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||||
@@ -417,7 +450,7 @@ steps:
|
|||||||
--ignore=lora/test_llm_with_multi_loras.py \
|
--ignore=lora/test_llm_with_multi_loras.py \
|
||||||
--ignore=lora/test_olmoe_tp.py \
|
--ignore=lora/test_olmoe_tp.py \
|
||||||
--ignore=lora/test_deepseekv2_tp.py \
|
--ignore=lora/test_deepseekv2_tp.py \
|
||||||
--ignore=lora/test_gptoss.py \
|
--ignore=lora/test_gptoss_tp.py \
|
||||||
--ignore=lora/test_qwen3moe_tp.py
|
--ignore=lora/test_qwen3moe_tp.py
|
||||||
|
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
@@ -430,15 +463,14 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_pass_manager.py
|
# Run unit tests defined directly under compile/,
|
||||||
- pytest -v -s compile/test_fusion.py
|
# not including subdirectories, which are usually heavier
|
||||||
- pytest -v -s compile/test_fusion_attn.py
|
# tests covered elsewhere.
|
||||||
- pytest -v -s compile/test_functionalization.py
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||||
- pytest -v -s compile/test_decorator.py
|
# (using -0 for proper path handling)
|
||||||
- pytest -v -s compile/test_noop_elimination.py
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
- pytest -v -s compile/test_aot_compile.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@@ -448,19 +480,27 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
# Run smoke tests under fullgraph directory, except test_full_graph.py
|
||||||
- pytest -v -s compile/piecewise/
|
# as it is a heavy test that is covered in other steps.
|
||||||
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
|
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||||
|
# (using -0 for proper path handling)
|
||||||
|
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 22min
|
- label: PyTorch Fullgraph Test # 27min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 40
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py
|
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||||
- pytest -v -s compile/test_fusions_e2e.py
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
|
# Limit to no custom ops to reduce running time
|
||||||
|
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||||
|
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||||
|
|
||||||
- label: Cudagraph test
|
- label: Cudagraph test
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@@ -532,10 +572,32 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/mamba
|
- pytest -v -s kernels/mamba
|
||||||
|
|
||||||
|
- label: Kernels DeepGEMM Test (H100)
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
gpu: h100
|
||||||
|
num_gpus: 1
|
||||||
|
source_file_dependencies:
|
||||||
|
- tools/install_deepgemm.sh
|
||||||
|
- vllm/utils/deep_gemm.py
|
||||||
|
- vllm/model_executor/layers/fused_moe
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
- tests/kernels/quantization/test_block_fp8.py
|
||||||
|
- tests/kernels/moe/test_deepgemm.py
|
||||||
|
- tests/kernels/moe/test_batched_deepgemm.py
|
||||||
|
- tests/kernels/attention/test_deepgemm_attention.py
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||||
|
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||||
|
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||||
|
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||||
|
|
||||||
- label: Model Executor Test # 23min
|
- label: Model Executor Test # 23min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
|
torch_nightly: true
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
|
- vllm/engine/arg_utils.py
|
||||||
|
- vllm/config/model.py
|
||||||
- vllm/model_executor
|
- vllm/model_executor
|
||||||
- tests/model_executor
|
- tests/model_executor
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
@@ -579,6 +641,7 @@ steps:
|
|||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
# TODO(jerryzh168): resolve the above comment
|
# TODO(jerryzh168): resolve the above comment
|
||||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||||
|
- uv pip install --system conch-triton-kernels
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
@@ -587,6 +650,7 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
|
autorun_on_main: true
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||||
|
|
||||||
@@ -638,6 +702,7 @@ steps:
|
|||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
|
- vllm/transformers_utils/
|
||||||
- tests/models/test_initialization.py
|
- tests/models/test_initialization.py
|
||||||
commands:
|
commands:
|
||||||
# Only when vLLM model source is modified - test initialization of a large
|
# Only when vLLM model source is modified - test initialization of a large
|
||||||
@@ -764,14 +829,24 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/language/pooling_mteb_test
|
- pytest -v -s models/language/pooling_mteb_test
|
||||||
|
|
||||||
- label: Multi-Modal Processor Test # 44min
|
- label: Multi-Modal Processor Test (CPU)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/multimodal
|
||||||
|
no_gpu: true
|
||||||
|
commands:
|
||||||
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
|
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||||
|
|
||||||
|
- label: Multi-Modal Processor Test
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/processing
|
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Standard) # 60min
|
- label: Multi-Modal Models Test (Standard) # 60min
|
||||||
timeout_in_minutes: 80
|
timeout_in_minutes: 80
|
||||||
@@ -848,6 +923,7 @@ steps:
|
|||||||
- label: Transformers Nightly Models Test
|
- label: Transformers Nightly Models Test
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
optional: true
|
optional: true
|
||||||
|
soft_fail: true
|
||||||
commands:
|
commands:
|
||||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||||
- pytest -v -s tests/models/test_initialization.py
|
- pytest -v -s tests/models/test_initialization.py
|
||||||
@@ -873,11 +949,16 @@ steps:
|
|||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||||
|
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||||
|
- vllm/platforms/cuda.py
|
||||||
|
- vllm/attention/selector.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||||
@@ -894,8 +975,9 @@ steps:
|
|||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
|
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||||
|
|
||||||
- label: Blackwell Fusion Tests # 30 min
|
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
@@ -903,18 +985,50 @@ steps:
|
|||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/v1/worker/
|
||||||
|
- vllm/v1/cudagraph_dispatcher.py
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/test_fusion_attn.py
|
||||||
|
- tests/compile/test_silu_mul_quant_fusion.py
|
||||||
|
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
|
- tests/compile/distributed/test_fusions_e2e.py
|
||||||
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||||
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
|
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
|
# Wrap with quotes to escape yaml
|
||||||
|
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||||
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
|
- label: Blackwell Fusion E2E Tests # 30 min
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
optional: true
|
||||||
|
num_gpus: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/fp4/
|
||||||
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/compilation/
|
- vllm/compilation/
|
||||||
# can affect pattern matching
|
# can affect pattern matching
|
||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/distributed/test_fusions_e2e.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
# Run all e2e fusion tests
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
|
||||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: Blackwell GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@@ -1012,7 +1126,7 @@ steps:
|
|||||||
- vllm/worker/worker_base.py
|
- vllm/worker/worker_base.py
|
||||||
- vllm/v1/engine/
|
- vllm/v1/engine/
|
||||||
- vllm/v1/worker/
|
- vllm/v1/worker/
|
||||||
- tests/compile/test_basic_correctness.py
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
- tests/compile/test_wrapper.py
|
- tests/compile/test_wrapper.py
|
||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/entrypoints/llm/test_collective_rpc.py
|
- tests/entrypoints/llm/test_collective_rpc.py
|
||||||
@@ -1024,10 +1138,11 @@ steps:
|
|||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
@@ -1119,6 +1234,7 @@ steps:
|
|||||||
- pytest -v -s -x lora/test_llama_tp.py
|
- pytest -v -s -x lora/test_llama_tp.py
|
||||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||||
|
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||||
|
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test # 33min
|
- label: Weight Loading Multiple GPU Test # 33min
|
||||||
@@ -1206,12 +1322,14 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/compile/test_async_tp.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
- label: Distributed Tests (B200) # optional
|
- label: Distributed Tests (B200) # optional
|
||||||
@@ -1222,6 +1340,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||||
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### RL Integration Tests #####
|
##### RL Integration Tests #####
|
||||||
- label: Prime-RL Integration Test # 15min
|
- label: Prime-RL Integration Test # 15min
|
||||||
@@ -1234,3 +1353,48 @@ steps:
|
|||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
|
||||||
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||||
|
|
||||||
|
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||||
|
|
||||||
|
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: b200
|
||||||
|
optional: true
|
||||||
|
num_gpus: 2
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
|
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||||
|
|
||||||
|
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||||
|
|||||||
.buildkite/test_areas/attention.yaml (new file)
@@ -0,0 +1,21 @@
+group: Attention
+depends_on:
+- image-build
+steps:
+- label: V1 attention (H100)
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - pytest -v -s v1/attention
+
+- label: V1 attention (B200)
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
.buildkite/test_areas/basic_correctness.yaml (new file)
@@ -0,0 +1,16 @@
+group: Basic Correctness
+depends_on:
+- image-build
+steps:
+- label: Basic Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
.buildkite/test_areas/benchmarks.yaml (new file)
@@ -0,0 +1,19 @@
+group: Benchmarks
+depends_on:
+- image-build
+steps:
+- label: Benchmarks
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/.buildkite"
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
.buildkite/test_areas/compile.yaml (new file)
@@ -0,0 +1,57 @@
+group: Compile
+depends_on:
+- image-build
+steps:
+- label: Fusion and Compile Tests (B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
+  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+  # Wrap with quotes to escape yaml
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E (2 GPUs)(B200)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+  - nvidia-smi
+  # Run all e2e fusion tests
+  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+
.buildkite/test_areas/cuda.yaml (new file)
@@ -0,0 +1,22 @@
+group: CUDA
+depends_on:
+- image-build
+steps:
+- label: Platform Tests (CUDA)
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+  - pytest -v -s cuda/test_cuda_context.py
+
+- label: Cudagraph
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
.buildkite/test_areas/distributed.yaml (new file)
@@ -0,0 +1,199 @@
+group: Distributed
+depends_on:
+- image-build
+steps:
+- label: Distributed Comm Ops
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+
+- label: Distributed (2 GPUs)
+  timeout_in_minutes: 90
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Tests (4 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and external_dp=2
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - cd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+
+- label: Distributed Tests (8 GPUs)(H100)
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
+- label: Distributed Tests (4 GPUs)(A100)
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: Distributed Tests (2 GPUs)(H200)
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: Distributed Tests (2 GPUs)(B200)
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+  - pytest -v -s tests/v1/distributed/test_dbo.py
+
+- label: 2 Node Test (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+  - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+
+- label: Pipeline + Context Parallelism (4 GPUs))
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
.buildkite/test_areas/e2e_integration.yaml (new file)
@@ -0,0 +1,59 @@
+group: E2E Integration
+depends_on:
+- image-build
+steps:
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+
+- label: Prime-RL Integration (2 GPUs)
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+  - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
+
+- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
.buildkite/test_areas/engine.yaml (new file)
@@ -0,0 +1,26 @@
+group: Engine
+depends_on:
+- image-build
+steps:
+- label: Engine
+  timeout_in_minutes: 15
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  - tests/test_vllm_port
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+
+- label: V1 e2e + engine
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  # TODO: accuracy does not match, whether setting
+  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+  - pytest -v -s v1/e2e
+  - pytest -v -s v1/engine
.buildkite/test_areas/entrypoints.yaml (new file)
@@ -0,0 +1,68 @@
+group: Entrypoints
+depends_on:
+- image-build
+steps:
+- label: Entrypoints Unit Tests
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration (LLM)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration (API Server)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/test_chat_utils.py
+
+
+- label: Entrypoints Integration (Pooling)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+
+- label: Entrypoints V1
+  timeout_in_minutes: 50
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  - pytest -v -s v1/entrypoints
+
+- label: OpenAI API Correctness
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
.buildkite/test_areas/expert_parallelism.yaml (new file)
@@ -0,0 +1,23 @@
+group: Expert Parallelism
+depends_on:
+- image-build
+steps:
+- label: EPLB Algorithm
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
117
.buildkite/test_areas/kernels.yaml
Normal file
117
.buildkite/test_areas/kernels.yaml
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
group: Kernels
|
||||||
|
depends_on:
|
||||||
|
- image-build
|
||||||
|
steps:
|
||||||
|
- label: Kernels Core Operation Test
|
||||||
|
```yaml
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  commands:
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py

- label: Kernels Attention Test %N
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Quantization Test %N
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels MoE Test %N
  timeout_in_minutes: 60
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
  - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Mamba Test
  timeout_in_minutes: 45
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
  - pytest -v -s kernels/mamba

- label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
  gpu: h100
  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization/test_block_fp8.py
  - tests/kernels/moe/test_deepgemm.py
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
  - pytest -v -s kernels/moe/test_deepgemm.py
  - pytest -v -s kernels/moe/test_batched_deepgemm.py
  - pytest -v -s kernels/attention/test_deepgemm_attention.py

- label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
  - vllm/platforms/cuda.py
  - vllm/attention/selector.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
  - pytest -v -s tests/kernels/attention/test_attention_selector.py
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
  - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
  # Quantization
  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
```
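Throughout these steps, `%N` in the label plus `parallelism: 2` fans the step out into parallel jobs, and the doubled `$$` defers variable expansion to job runtime, where Buildkite sets `BUILDKITE_PARALLEL_JOB` (0-based shard index) and `BUILDKITE_PARALLEL_JOB_COUNT`. As a sketch, shard 0 of the attention step above effectively executes:

```bash
# Buildkite exports these for each parallel job; the values here are
# what shard 0 of 2 would see (BUILDKITE_PARALLEL_JOB is 0-based).
export BUILDKITE_PARALLEL_JOB=0
export BUILDKITE_PARALLEL_JOB_COUNT=2
pytest -v -s kernels/attention \
  --shard-id="$BUILDKITE_PARALLEL_JOB" \
  --num-shards="$BUILDKITE_PARALLEL_JOB_COUNT"
```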
**.buildkite/test_areas/lm_eval.yaml** (new file, 46 lines)

```yaml
group: LM Eval
depends_on:
- image-build
steps:
- label: LM Eval Small Models
  timeout_in_minutes: 75
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: LM Eval Large Models (4 GPUs)(A100)
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

- label: LM Eval Large Models (4 GPUs)(H100)
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

- label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
  gpu: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
```
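These eval steps can be reproduced outside Buildkite by running the same pytest command from the step's `working_dir`. A sketch for the A100 large-models step, assuming a repository checkout and four local GPUs:

```bash
cd .buildkite/lm-eval-harness
# Same env var the CI step exports before launching the eval.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v test_lm_eval_correctness.py \
  --config-list-file=configs/models-large.txt --tp-size=4
```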
**.buildkite/test_areas/lora.yaml** (new file, 31 lines)

```yaml
group: LoRA
depends_on:
- image-build
steps:
- label: LoRA %N
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4

- label: LoRA TP (Distributed)
  timeout_in_minutes: 30
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # FIXIT: find out which code initialize cuda before running the test
  # before the fix, we need to use spawn to test it
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  # There is some Tensor Parallelism related processing logic in LoRA that
  # requires multi-GPU testing for validation.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
  - pytest -v -s -x lora/test_olmoe_tp.py
  - pytest -v -s -x lora/test_gptoss_tp.py
```
**.buildkite/test_areas/misc.yaml** (new file, 163 lines)

```yaml
group: Miscellaneous
depends_on:
- image-build
steps:
- label: V1 Others
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  # split the test to avoid interference
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/worker
  - pytest -v -s v1/spec_decode
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Others (CPU)
  depends_on: ~
  source_file_dependencies:
  - vllm/
  - tests/v1
  no_gpu: true
  commands:
  # split the test to avoid interference
  - pytest -v -s -m 'cpu_test' v1/core
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'cpu_test' v1/metrics

- label: Regression
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Examples
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  commands:
  - pip install tensorizer # for tensorizer test
  - python3 offline_inference/basic/chat.py # for basic
  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
  - python3 offline_inference/basic/classify.py
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  # for multi-modal models
  - python3 offline_inference/audio_language.py --seed 0
  - python3 offline_inference/vision_language.py --seed 0
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  # for pooling models
  - python3 pooling/pooling/vision_language_pooling.py --seed 0
  # for features demo
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py
  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

- label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  - "pip install \
      'opentelemetry-sdk>=1.26.0' \
      'opentelemetry-api>=1.26.0' \
      'opentelemetry-exporter-otlp>=1.26.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s v1/tracing

- label: Python-only Installation
  depends_on: ~
  timeout_in_minutes: 20
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh

- label: Async Engine, Inputs, Utils, Worker
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
  depends_on: ~
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/transformers_utils
  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s tokenizers_
  - pytest -v -s transformers_utils
  - pytest -v -s config

- label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

- label: Batch Invariance (H100)
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
  - vllm/v1/attention
  - vllm/model_executor/layers
  - tests/v1/determinism/
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pip install pytest-timeout pytest-forked
  - pytest -v -s v1/determinism/test_batch_invariance.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
```
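Several of the steps above split one test tree between a GPU job (`-m 'not cpu_test'`) and a `no_gpu: true` job (`-m 'cpu_test'`), so every test runs exactly once. A quick local sanity check that the two marker expressions really partition a directory is to collect both halves without running anything; a sketch:

```bash
cd tests
# --collect-only -q lists selected tests without executing them;
# the first two counts should sum to the unfiltered collection.
pytest v1/core -m 'not cpu_test' --collect-only -q | tail -n 1
pytest v1/core -m 'cpu_test' --collect-only -q | tail -n 1
pytest v1/core --collect-only -q | tail -n 1
```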
**.buildkite/test_areas/model_executor.yaml** (new file, 17 lines)

```yaml
group: Model Executor
depends_on:
- image-build
steps:
- label: Model Executor
  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s model_executor
  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
```
**.buildkite/test_areas/models_basic.yaml** (new file, 62 lines)

```yaml
group: Models - Basic
depends_on:
- image-build
steps:
- label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
  commands:
  # Run a subset of model initialization tests
  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/test_initialization.py
  commands:
  # Only when vLLM model source is modified - test initialization of a large
  # subset of supported models (the complement of the small subset in the above
  # test.) Also run if model initialization test file is modified
  - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Basic Models Tests (Other)
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
  - pytest -v -s models/test_transformers.py models/test_registry.py

- label: Basic Models Test (Other CPU) # 5min
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
  no_gpu: true
  commands:
  - pytest -v -s models/test_utils.py models/test_vision.py

- label: Transformers Nightly Models
  working_dir: "/vllm-workspace/"
  optional: true
  soft_fail: true
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
  - pytest -v -s tests/models/test_initialization.py
  - pytest -v -s tests/models/test_transformers.py
  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  - python3 examples/offline_inference/basic/chat.py
  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
```
**.buildkite/test_areas/models_distributed.yaml** (new file, 22 lines)

```yaml
group: Models - Distributed
depends_on:
- image-build
steps:
- label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
```
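The `distributed(num_gpus=2)` marker selects only tests declared to need a two-GPU topology, and the step requests `num_gpus: 2` to match. On a machine with more GPUs, a sketch of reproducing one of these commands while pinning the visible device set (the indices are only an example):

```bash
cd tests
# Expose exactly two devices so the tests see the same topology as CI.
export CUDA_VISIBLE_DEVICES=0,1
TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
```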
**.buildkite/test_areas/models_language.yaml** (new file, 91 lines)

```yaml
group: Models - Language
depends_on:
- image-build
steps:
- label: Language Models Tests (Standard)
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
  # Test standard language models, excluding a subset of slow tests
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'

- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  commands:
  # Shard slow subset of standard language models tests. Only run when model
  # source is modified, or when specified test files are modified
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  # Shard hybrid language model tests
  - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

- label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
  - pytest -v -s models/language/generation_ppl_test

- label: Language Models Test (Extended Pooling) # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
  - pytest -v -s models/language/pooling -m 'not core_model'

- label: Language Models Test (MTEB)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
  - pytest -v -s models/language/pooling_mteb_test
```
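A note on the `--no-build-isolation` installs above: both `mamba` and `causal-conv1d` compile CUDA extensions against torch at build time, and the default isolated build environment would not contain the torch already installed in the CI image. Run outside CI, the same install looks like this (`--system` writes into the active interpreter rather than requiring a venv):

```bash
# Build the fast-path kernels against the torch already present,
# instead of inside a fresh isolated build environment.
uv pip install --system --no-build-isolation \
  'git+https://github.com/state-spaces/mamba@v2.2.5'
uv pip install --system --no-build-isolation \
  'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
```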
**.buildkite/test_areas/models_multimodal.yaml** (new file, 79 lines)

```yaml
group: Models - Multimodal
depends_on:
- image-build
steps:
- label: Multi-Modal Models (Standard) # 60min
  timeout_in_minutes: 80
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Processor Test (CPU)
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  no_gpu: true
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Processor # 44min
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

- label: Multi-Modal Models (Extended) 1
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

- label: Multi-Modal Models (Extended) 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models (Extended) 3
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
  optional: true
  commands:
  - echo 'Testing custom models...'
  # PR authors can temporarily add commands below to test individual models
  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
```
**.buildkite/test_areas/plugins.yaml** (new file, 34 lines)

```yaml
group: Plugins
depends_on:
- image-build
steps:
- label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
```
**.buildkite/test_areas/pytorch.yaml** (new file, 50 lines)

```yaml
group: PyTorch
depends_on:
- image-build
steps:
- label: PyTorch Compilation Unit Tests
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # Run unit tests defined directly under compile/,
  # not including subdirectories, which are usually heavier
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"

- label: PyTorch Fullgraph Smoke Test
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # Run smoke tests under fullgraph directory, except test_full_graph.py
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

- label: PyTorch Fullgraph
  timeout_in_minutes: 40
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

- label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh
```
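The quoted `find ... -exec pytest ... {} \\;` commands above launch one pytest process per test file, which is the isolation workaround the comments reference for https://github.com/vllm-project/vllm/issues/28965; `-exec ... \;` runs the command once for each match. Unescaped from YAML, the first command is simply:

```bash
cd tests
# One pytest process per file; -maxdepth 1 keeps the heavier
# subdirectory suites (covered by other steps) out of this run.
find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;
```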
**.buildkite/test_areas/quantization.yaml** (new file, 46 lines)

```yaml
group: Quantization
depends_on:
- image-build
steps:
- label: Quantization
  timeout_in_minutes: 90
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here

  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/models/llama4.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization/compressed_tensors
  - vllm/model_executor/layers/quantization/modelopt.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - pytest -s -v tests/quantization/test_blackwell_moe.py

- label: Quantized Models Test
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
  - pytest -v -s models/quantization
```
**.buildkite/test_areas/samplers.yaml** (new file, 14 lines)

```yaml
group: Samplers
depends_on:
- image-build
steps:
- label: Samplers Test
  timeout_in_minutes: 75
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
```
**.buildkite/test_areas/tool_use.yaml** (new file, 23 lines)

```yaml
group: Tool use
depends_on:
- image-build
steps:
- label: OpenAI-Compatible Tool Use
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  commands:
  - pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU)
  depends_on: ~
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  no_gpu: true
  commands:
  - pytest -v -s -m 'cpu_test' tool_use
```
**.buildkite/test_areas/weight_loading.yaml** (new file, 25 lines)

```yaml
group: Weight Loading
depends_on:
- image-build
steps:
- label: Weight Loading Multiple GPU # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU - Large Models # optional
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
```
**.github/CODEOWNERS** (63 changed lines)

```diff
@@ -3,13 +3,14 @@
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/model_executor/layers/batch_invariant.py @yewentao256
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -20,27 +21,30 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
 /vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
 # Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
 /tests/evals @mgoin
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
@@ -49,18 +53,29 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
+/tests/v1/determinism @yewentao256

-# Transformers backend
+# Transformers modeling backend
 /vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor

+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
+
 # Docs
 /docs/mkdocs @hmellor
 /docs/**/*.yml @hmellor
@@ -105,11 +120,21 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/triton_unified_attention.py @tdoublep

 # ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/vllm/**/*rocm* @tjtanaa
+/docker/Dockerfile.rocm* @gshtras @tjtanaa
+/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
+/csrc/rocm @gshtras @tjtanaa
+/requirements/*rocm* @tjtanaa
+/tests/**/*rocm* @tjtanaa
+/docs/**/*rocm* @tjtanaa
+/vllm/**/*quark* @tjtanaa
+/tests/**/*quark* @tjtanaa
+/docs/**/*quark* @tjtanaa
+/vllm/**/*aiter* @tjtanaa
+/tests/**/*aiter* @tjtanaa

 # TPU
 /vllm/v1/worker/tpu* @NickLucche
@@ -121,9 +146,15 @@ mkdocs.yaml @hmellor
 /requirements/kv_connectors.txt @NickLucche

 # Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop
+
+# Security guide and policies
+/docs/usage/security.md @russellb
+/SECURITY.md @russellb
+/docs/contributing/vulnerability_management.md @russellb
```
**.github/mergify.yml** (65 changed lines)

````diff
@@ -14,6 +14,52 @@ pull_request_rules:
     comment:
       message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

+- name: comment-pre-commit-failure
+  description: Comment on PR when pre-commit check fails
+  conditions:
+    - status-failure=pre-commit
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+        ```bash
+        uv pip install pre-commit
+        pre-commit install
+        pre-commit run --all-files
+        ```
+
+        Then, commit the changes and push to your branch.
+
+        For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+        > [!TIP]
+        > <details>
+        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+        > <br/>
+        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+        >
+        > ```bash
+        > # For mypy (substitute "3.10" with the failing version if needed)
+        > pre-commit run --hook-stage manual mypy-3.10
+        > # For markdownlint
+        > pre-commit run --hook-stage manual markdownlint
+        > ```
+        > </details>
+
+- name: comment-dco-failure
+  description: Comment on PR when DCO check fails
+  conditions:
+    - status-failure=dco
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
     - files~=^tests/entrypoints/test_context.py
     - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
     - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-    - files~=^vllm/entrypoints/harmony_utils.py
+    - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
     - files~=^vllm/entrypoints/tool_server.py
     - files~=^vllm/entrypoints/tool.py
    - files~=^vllm/entrypoints/context.py
@@ -151,6 +197,23 @@ pull_request_rules:
       add:
         - gpt-oss

+- name: label-nvidia
+  description: Automatically apply nvidia label
+  conditions:
+    - label != stale
+    - or:
+      - files~=cuda
+      - files~=cutlass
+      - files~=flashinfer
+      - files~=trtllm
+      - title~=(?i)NVIDIA
+      - title~=(?i)CUDA
+      - title~=(?i)CUTLASS
+  actions:
+    label:
+      add:
+        - nvidia
+
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
````
**.github/workflows/cleanup_pr_body.yml** (4 changed lines)

```diff
@@ -13,10 +13,10 @@ jobs:
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

       - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
         with:
           python-version: '3.12'
```
**.github/workflows/issue_autolabel.yml** (25 changed lines)

```diff
@@ -105,6 +105,31 @@ jobs:
               }
             ],
           },
+          cpu: {
+            // Keyword search - matches whole words only (with word boundaries)
+            keywords: [
+              {
+                term: "CPU Backend",
+                searchIn: "title"
+              },
+              {
+                term: "x86",
+                searchIn: "title"
+              },
+              {
+                term: "ARM",
+                searchIn: "title"
+              },
+              {
+                term: "Apple Silicon",
+                searchIn: "title"
+              },
+              {
+                term: "IBM Z",
+                searchIn: "title"
+              },
+            ],
+          },
           // Add more label configurations here as needed
           // example: {
           //   keywords: [...],
```
**.github/workflows/macos-smoke-test.yml** (new file, 80 lines)

```yaml
name: macOS Apple Silicon Smoke Test

on:
  push:
    branches:
      - main
  workflow_dispatch: # Manual trigger

jobs:
  macos-m1-smoke-test:
    runs-on: macos-latest
    timeout-minutes: 30

    steps:
      - uses: actions/checkout@v6.0.1

      - uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: |
            requirements/**/*.txt
            pyproject.toml
          python-version: '3.12'

      - name: Create virtual environment
        run: |
          uv venv
          echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"

      - name: Install dependencies and build vLLM
        run: |
          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
          uv pip install -e .
        env:
          CMAKE_BUILD_PARALLEL_LEVEL: 4

      - name: Verify installation
        run: |
          python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"

      - name: Smoke test vllm serve
        run: |
          # Start server in background
          vllm serve Qwen/Qwen3-0.6B \
            --max-model-len=2K \
            --load-format=dummy \
            --hf-overrides '{"num_hidden_layers": 2}' \
            --enforce-eager \
            --port 8000 &

          SERVER_PID=$!

          # Wait for server to start
          for i in {1..30}; do
            if curl -s http://localhost:8000/health > /dev/null; then
              echo "Server started successfully"
              break
            fi
            if [ "$i" -eq 30 ]; then
              echo "Server failed to start"
              kill "$SERVER_PID"
              exit 1
            fi
            sleep 2
          done

          # Test health endpoint
          curl -f http://localhost:8000/health

          # Test completion
          curl -f http://localhost:8000/v1/completions \
            -H "Content-Type: application/json" \
            -d '{
              "model": "Qwen/Qwen3-0.6B",
              "prompt": "Hello",
              "max_tokens": 5
            }'

          # Cleanup
          kill "$SERVER_PID"
```
**.github/workflows/pre-commit.yml** (4 changed lines)

```diff
@@ -16,8 +16,8 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-      - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+      - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: "3.12"
      - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
```
**.github/workflows/stale.yml** (4 changed lines)

```diff
@@ -7,13 +7,15 @@ on:
 jobs:
   close-issues-and-pull-requests:
+    # Prevents triggering on forks or other repos
+    if: github.repository == 'vllm-project/vllm'
     permissions:
       issues: write
       pull-requests: write
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
```
**.gitignore** (6 changed lines)

```diff
@@ -4,6 +4,9 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*

+# OpenAI triton kernels copied from source
+vllm/third_party/triton_kernels/*
+
 # triton jit
 .triton

@@ -221,3 +224,6 @@ csrc/moe/marlin_moe_wna16/kernel_*
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
```
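The new `!vllm/benchmarks/lib/` entry is a negation pattern: it re-includes a path that an earlier rule would otherwise ignore. When editing rules like this, `git check-ignore -v` shows exactly which pattern decides a given path; a sketch (the probed file path is hypothetical):

```bash
# Prints "<gitignore-file>:<line>:<pattern>  <path>" for the deciding rule;
# exit status 1 means the path is not ignored.
git check-ignore -v vllm/benchmarks/lib/some_file.py
```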
(markdownlint configuration)

```diff
@@ -3,10 +3,9 @@ MD007:
 MD013: false
 MD024:
   siblings_only: true
+MD031:
+  list_items: false
 MD033: false
-MD045: false
 MD046: false
-MD051: false
 MD052: false
-MD053: false
 MD059: false
```
**.pre-commit-config.yaml**

```diff
@@ -38,7 +38,7 @@ repos:
     rev: 0.9.1
     hooks:
       - id: pip-compile
-        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
+        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
         files: ^requirements/test\.(in|txt)$
   - repo: local
     hooks:
```
171
CMakeLists.txt
171
CMakeLists.txt
@@ -39,6 +39,13 @@ set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
 
+# ROCm installation prefix. Default to /opt/rocm but allow override via
+# -DROCM_PATH=/your/rocm/path when invoking cmake.
+if(NOT DEFINED ROCM_PATH)
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
+else()
+  set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
+endif()
 #
 # Supported/expected torch versions for CUDA/ROCm.
 #
@@ -129,7 +136,7 @@ elseif(HIP_FOUND)
 
   # ROCm 5.X and 6.X
   if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+      Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
     message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
       "expected for ROCm build, saw ${Torch_VERSION} instead.")
   endif()
@@ -237,11 +244,28 @@ set_gencode_flags_for_srcs(
   SRCS "${VLLM_CUMEM_EXT_SRC}"
   CUDA_ARCHS "${CUDA_ARCHS}")
 
-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling cumem allocator extension.")
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
     # link against cuda driver library
     list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
+  else()
+    # link against rocm driver library. Prefer an absolute path to
+    # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
+    # back to linking by name "amdhip64".
+    find_library(AMDHIP64_LIB
+      NAMES amdhip64 libamdhip64.so
+      PATHS ${ROCM_PATH}/lib
+      NO_DEFAULT_PATH)
+    if(AMDHIP64_LIB)
+      message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+      list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+    else()
+      message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+      list(APPEND CUMEM_LIBS amdhip64)
+    endif()
+  endif()
+  define_extension_target(
     cumem_allocator
     DESTINATION vllm
     LANGUAGE CXX
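The HIP branch above resolves the driver library by absolute path first and only falls back to the soname. A minimal Python sketch of the same lookup order (a hypothetical helper for illustration, not part of the build; it only succeeds on a host with ROCm installed):

```python
import ctypes
import os

def load_hip_driver(rocm_path: str = "/opt/rocm"):
    """Prefer the absolute path under ${ROCM_PATH}/lib, then fall back to the soname."""
    candidate = os.path.join(rocm_path, "lib", "libamdhip64.so")
    if os.path.exists(candidate):
        return ctypes.CDLL(candidate)  # mirrors the find_library() hit
    return ctypes.CDLL("libamdhip64.so")  # mirrors linking "amdhip64" by name
```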
@@ -265,6 +289,7 @@ set(VLLM_EXT_SRC
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
   "csrc/layernorm_quant_kernels.cu"
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
@@ -282,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -329,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for fp16 output
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+
   if (MARLIN_ARCHS)
 
     #
@@ -340,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set(MARLIN_GEN_SCRIPT
       ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
     file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
 
-    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
 
-    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
-      OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+      OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
       execute_process(
         COMMAND ${CMAKE_COMMAND} -E env
         PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
         RESULT_VARIABLE marlin_generation_result
         OUTPUT_VARIABLE marlin_generation_result
         OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -362,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
         "\nCheck the log for details: "
         "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
     else()
-      set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
-        CACHE STRING "Last run Marlin generate script hash" FORCE)
+      set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+        CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
       message(STATUS "Marlin generation completed successfully.")
     endif()
   else()
     message(STATUS "Marlin generation script has not changed, skipping generation.")
   endif()
 
-  file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+  file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
   set_gencode_flags_for_srcs(
     SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
     CUDA_ARCHS "${MARLIN_ARCHS}")
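The two hunks above widen the regeneration cache key from the generator script's MD5 alone to the MD5 plus the comma-joined arch list, so changing `CUDA_ARCHS` now re-runs kernel generation even when the script itself is untouched. A rough Python model of that decision (names are illustrative, not vLLM APIs):

```python
import hashlib
from pathlib import Path

def needs_regeneration(script: Path, cuda_archs: list[str], cached_key: str | None) -> bool:
    """Re-run the generator when either the script or the arch list changed."""
    script_hash = hashlib.md5(script.read_bytes()).hexdigest()
    key = f"{script_hash}(ARCH:{','.join(cuda_archs)})"  # same shape as the CMake cache entry
    return cached_key != key
```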
@@ -378,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
       PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
   endif()
 
   list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 
+  file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+    CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+  endif()
+  list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+
+  if (MARLIN_FP8_ARCHS)
+    file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+      CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+  endif()
+
   set(MARLIN_SRCS
     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+    "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
   set_gencode_flags_for_srcs(
@@ -487,9 +545,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
@@ -579,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
       "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${FP4_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
     message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
   else()
     message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -594,9 +655,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # FP4 Archs and flags
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
@@ -670,7 +731,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
@@ -716,9 +777,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
@@ -813,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+      "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+      "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+      "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+    )
 
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -836,7 +900,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   if(HADACORE_ARCHS)
     set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
     set_gencode_flags_for_srcs(
@@ -858,7 +922,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 message(STATUS "Enabling C extension.")
-define_gpu_extension_target(
+define_extension_target(
   _C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -883,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -913,8 +976,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     CUDA_ARCHS "${CUDA_ARCHS}")
 
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin arches
+  # note that we always set `use_atomic_add=False` for moe marlin now,
+  # so we don't need 9.0 for bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
 
     #
@@ -924,16 +994,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     set(MOE_MARLIN_GEN_SCRIPT
       ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
     file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
 
-    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
 
-    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
-      OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+      OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
       execute_process(
         COMMAND ${CMAKE_COMMAND} -E env
         PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
         RESULT_VARIABLE moe_marlin_generation_result
         OUTPUT_VARIABLE moe_marlin_generation_output
         OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -946,7 +1018,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
         "\nCheck the log for details: "
         "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
     else()
-      set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+      set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
         CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
       message(STATUS "Marlin MOE generation completed successfully.")
     endif()
@@ -954,16 +1026,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
   endif()
 
-  file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+  file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+  list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
   set_gencode_flags_for_srcs(
-    SRCS "${MOE_WNAA16_MARLIN_SRC}"
+    SRCS "${MARLIN_MOE_SRC}"
     CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-    set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+    set_source_files_properties(${MARLIN_MOE_SRC}
      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
   endif()
+  list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
 
-  list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+  if (MARLIN_MOE_FP8_ARCHS)
+    file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_MOE_FP8_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+  endif()
 
   message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
   else()
@@ -973,7 +1057,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   message(STATUS "Enabling moe extension.")
-  define_gpu_extension_target(
+  define_extension_target(
    _moe_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
@@ -994,7 +1078,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")
 
-  define_gpu_extension_target(
+  define_extension_target(
    _rocm_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
@@ -1005,6 +1089,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     WITH_SOABI)
 endif()
 
+# For CUDA and HIP builds also build the triton_kernels external package.
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+  include(cmake/external_projects/triton_kernels.cmake)
+endif()
+
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
@@ -21,6 +21,9 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
+- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
@@ -83,7 +86,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
@@ -134,6 +137,7 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
+- Arm
 - AWS
 - Crusoe Cloud
 - Databricks
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
 ```
 
-#### 2. Maximize Throughput with a Latency Requirement
+### 2. Maximize Throughput with a Latency Requirement
 
 - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
 - **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=500
 ```
 
-#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements
 
 - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
 - **Configuration**:
@@ -96,8 +96,9 @@ start_server() {
   # This correctly passes each element as a separate argument.
   if [[ -n "$profile_dir" ]]; then
     # Start server with profiling enabled
-    VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
-      vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+    local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+    VLLM_SERVER_DEV_MODE=1 \
+      vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
   else
     # Start server without profiling
     VLLM_SERVER_DEV_MODE=1 \
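The hunk above drops the `VLLM_TORCH_PROFILER_DIR` environment variable in favor of a `--profiler-config` JSON argument. A small Python sketch of assembling that payload before shelling out; the two keys come straight from the diff, while the directory and command line are a hypothetical example:

```python
import json
import shlex

profile_dir = "/tmp/vllm_profile"  # hypothetical output directory
config = {"profiler": "torch", "torch_profiler_dir": profile_dir}  # keys as in the diff
print(f"VLLM_SERVER_DEV_MODE=1 vllm serve --profiler-config {shlex.quote(json.dumps(config))}")
```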
@@ -620,7 +620,7 @@ def get_tokenizer(
         kwargs["use_fast"] = False
     if tokenizer_mode == "mistral":
         try:
-            from vllm.transformers_utils.tokenizer import MistralTokenizer
+            from vllm.tokenizers import MistralTokenizer
         except ImportError as e:
             raise ImportError(
                 "MistralTokenizer requires vllm package.\n"
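This hunk, and two more below, move tokenizer imports from `vllm.transformers_utils.tokenizer` to `vllm.tokenizers`. External code that has to run against both older and newer vLLM releases could bridge the rename with a fallback import; a sketch, not something the diff itself adds:

```python
try:
    from vllm.tokenizers import get_tokenizer  # new location in this release series
except ImportError:
    from vllm.transformers_utils.tokenizer import get_tokenizer  # older releases
```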
benchmarks/benchmark_batch_invariance.py (new executable file, 380 lines)
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
+
+This benchmark runs the same workload twice:
+1. With VLLM_BATCH_INVARIANT=0 (baseline)
+2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
+
+And reports the timing and throughput metrics for comparison.
+
+Environment variables:
+    VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
+    VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
+    VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
+    VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
+    VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
+    VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
+    VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
+    VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
+    VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
+    VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
+    VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
+
+Example usage:
+    # Benchmark qwen3 (default)
+    python benchmarks/benchmark_batch_invariance.py
+
+    # Benchmark deepseek with 8 GPUs
+    VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
+        python benchmarks/benchmark_batch_invariance.py
+
+    # Quick test with fewer trials
+    VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
+        python benchmarks/benchmark_batch_invariance.py
+"""
+
+import contextlib
+import os
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+
+
+def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
+    """Generate a random prompt for benchmarking."""
+    prompt_templates = [
+        "Question: What is the capital of France?\nAnswer: The capital of France is",
+        "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
+        "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
+        "Once upon a time in a distant galaxy, there lived",
+        "The old man walked slowly down the street, remembering",
+        "In the year 2157, humanity finally discovered",
+        "To implement a binary search tree in Python, first we need to",
+        "The algorithm works by iterating through the array and",
+        "Here's how to optimize database queries using indexing:",
+        "The Renaissance was a period in European history that",
+        "Climate change is caused by several factors including",
+        "The human brain contains approximately 86 billion neurons which",
+        "I've been thinking about getting a new laptop because",
+        "Yesterday I went to the store and bought",
+        "My favorite thing about summer is definitely",
+    ]
+
+    base_prompt = random.choice(prompt_templates)
+
+    if max_words < min_words:
+        max_words = min_words
+    target_words = random.randint(min_words, max_words)
+
+    if target_words > 50:
+        padding_text = (
+            " This is an interesting topic that deserves more explanation. "
+            * (target_words // 50)
+        )
+        base_prompt = base_prompt + padding_text
+
+    return base_prompt
+
+
+def run_benchmark_with_batch_invariant(
+    model: str,
+    tp_size: int,
+    max_batch_size: int,
+    num_trials: int,
+    min_prompt: int,
+    max_prompt: int,
+    max_tokens: int,
+    temperature: float,
+    gpu_mem_util: float,
+    max_model_len: int,
+    backend: str,
+    batch_invariant: bool,
+    seed: int = 12345,
+) -> dict:
+    """
+    Run the benchmark with the specified configuration.
+
+    Returns a dict with timing and throughput metrics.
+    """
+    random.seed(seed)
+
+    # Set environment variables
+    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    if batch_invariant:
+        os.environ["VLLM_BATCH_INVARIANT"] = "1"
+    else:
+        os.environ["VLLM_BATCH_INVARIANT"] = "0"
+
+    print(f"\n{'=' * 80}")
+    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
+    print(f"  Model: {model}")
+    print(f"  TP Size: {tp_size}")
+    print(f"  Backend: {backend}")
+    print(f"  Max Batch Size: {max_batch_size}")
+    print(f"  Trials: {num_trials}")
+    print(f"  Max Tokens: {max_tokens}")
+    print(f"{'=' * 80}\n")
+
+    sampling = SamplingParams(
+        temperature=temperature,
+        top_p=0.95,
+        max_tokens=max_tokens,
+        seed=20240919,
+    )
+
+    needle_prompt = "There once was a "
+
+    llm = None
+    try:
+        # Create LLM engine
+        start_init = time.perf_counter()
+        llm = LLM(
+            model=model,
+            max_num_seqs=max_batch_size,
+            gpu_memory_utilization=gpu_mem_util,
+            max_model_len=max_model_len,
+            dtype="bfloat16",
+            tensor_parallel_size=tp_size,
+            enable_prefix_caching=False,
+        )
+        init_time = time.perf_counter() - start_init
+        print(f"Engine initialization time: {init_time:.2f}s\n")
+
+        # Generate baseline
+        print("Generating baseline (warmup)...")
+        baseline_out = llm.generate([needle_prompt], sampling)
+        assert len(baseline_out) == 1
+        baseline_text = baseline_out[0].outputs[0].text
+        print(f"Baseline output: '{baseline_text[:50]}...'\n")
+
+        # Run trials and measure timing
+        trial_times: list[float] = []
+        total_tokens = 0
+        total_prompts = 0
+
+        for trial in range(num_trials):
+            # Create a batch
+            prompts: list[str] = []
+            batch_size = random.randint(max_batch_size // 2, max_batch_size)
+            needle_pos = random.randint(0, batch_size - 1)
+            for i in range(batch_size):
+                if i == needle_pos:
+                    prompts.append(needle_prompt)
+                else:
+                    prompts.append(_random_prompt(min_prompt, max_prompt))
+
+            # Measure time for this trial
+            start_time = time.perf_counter()
+            outputs = llm.generate(prompts, sampling)
+            trial_time = time.perf_counter() - start_time
+
+            trial_times.append(trial_time)
+            total_prompts += len(prompts)
+
+            # Count tokens
+            for output in outputs:
+                if output.outputs:
+                    total_tokens += len(output.outputs[0].token_ids)
+
+            print(
+                f"Trial {trial + 1}/{num_trials}: "
+                f"batch_size={batch_size}, "
+                f"time={trial_time:.2f}s"
+            )
+
+            # Verify needle output still matches
+            needle_output = outputs[needle_pos]
+            assert needle_output.prompt == needle_prompt
+
+        # Compute statistics
+        avg_time = sum(trial_times) / len(trial_times)
+        min_time = min(trial_times)
+        max_time = max(trial_times)
+        throughput = total_tokens / sum(trial_times)
+        prompts_per_sec = total_prompts / sum(trial_times)
+
+        print(f"\n{'=' * 80}")
+        print("RESULTS:")
+        print(f"  Average time per trial: {avg_time:.2f}s")
+        print(f"  Min time: {min_time:.2f}s")
+        print(f"  Max time: {max_time:.2f}s")
+        print(f"  Total tokens generated: {total_tokens}")
+        print(f"  Total prompts processed: {total_prompts}")
+        print(f"  Throughput: {throughput:.2f} tokens/s")
+        print(f"  Prompts/s: {prompts_per_sec:.2f}")
+        print(f"{'=' * 80}\n")
+
+        return {
+            "init_time": init_time,
+            "avg_time": avg_time,
+            "min_time": min_time,
+            "max_time": max_time,
+            "total_tokens": total_tokens,
+            "total_prompts": total_prompts,
+            "throughput": throughput,
+            "prompts_per_sec": prompts_per_sec,
+            "trial_times": trial_times,
+        }
+
+    finally:
+        # Cleanup
+        if llm is not None:
+            with contextlib.suppress(Exception):
+                llm.shutdown()
+
+
+def main():
+    # Check platform support
+    if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
+        print("ERROR: Requires CUDA and >= Hopper (SM90)")
+        print(f"Current platform: {current_platform.device_type}")
+        if current_platform.is_cuda():
+            print(f"Device capability: {current_platform.get_device_capability()}")
+        return 1
+
+    # Read configuration from environment
+    model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
+    tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
+    max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
+    num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
+    min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
+    max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
+    max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
+    temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
+    gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
+    max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
+    backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
+
+    print("\n" + "=" * 80)
+    print("VLLM BATCH INVARIANCE BENCHMARK")
+    print("=" * 80)
+    print("\nConfiguration:")
+    print(f"  Model: {model}")
+    print(f"  Tensor Parallel Size: {tp_size}")
+    print(f"  Attention Backend: {backend}")
+    print(f"  Max Batch Size: {max_batch_size}")
+    print(f"  Number of Trials: {num_trials}")
+    print(f"  Prompt Length Range: {min_prompt}-{max_prompt} words")
+    print(f"  Max Tokens to Generate: {max_tokens}")
+    print(f"  Temperature: {temperature}")
+    print(f"  GPU Memory Utilization: {gpu_mem_util}")
+    print(f"  Max Model Length: {max_model_len}")
+    print("=" * 80)
+
+    # Run benchmark WITHOUT batch invariance (baseline)
+    print("\n" + "=" * 80)
+    print("PHASE 1: Running WITHOUT batch invariance (baseline)")
+    print("=" * 80)
+    baseline_results = run_benchmark_with_batch_invariant(
+        model=model,
+        tp_size=tp_size,
+        max_batch_size=max_batch_size,
+        num_trials=num_trials,
+        min_prompt=min_prompt,
+        max_prompt=max_prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        gpu_mem_util=gpu_mem_util,
+        max_model_len=max_model_len,
+        backend=backend,
+        batch_invariant=False,
+    )
+
+    # Run benchmark WITH batch invariance
+    print("\n" + "=" * 80)
+    print("PHASE 2: Running WITH batch invariance")
+    print("=" * 80)
+    batch_inv_results = run_benchmark_with_batch_invariant(
+        model=model,
+        tp_size=tp_size,
+        max_batch_size=max_batch_size,
+        num_trials=num_trials,
+        min_prompt=min_prompt,
+        max_prompt=max_prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        gpu_mem_util=gpu_mem_util,
+        max_model_len=max_model_len,
+        backend=backend,
+        batch_invariant=True,
+    )
+
+    # Compare results
+    print("\n" + "=" * 80)
+    print("COMPARISON: Batch Invariance vs Baseline")
+    print("=" * 80)
+
+    init_overhead_pct = (
+        (batch_inv_results["init_time"] - baseline_results["init_time"])
+        / baseline_results["init_time"]
+        * 100
+    )
+    time_overhead_pct = (
+        (batch_inv_results["avg_time"] - baseline_results["avg_time"])
+        / baseline_results["avg_time"]
+        * 100
+    )
+    throughput_change_pct = (
+        (batch_inv_results["throughput"] - baseline_results["throughput"])
+        / baseline_results["throughput"]
+        * 100
+    )
+
+    print("\nInitialization Time:")
+    print(f"  Baseline: {baseline_results['init_time']:.2f}s")
+    print(f"  Batch Invariant: {batch_inv_results['init_time']:.2f}s")
+    print(f"  Overhead: {init_overhead_pct:+.2f}%")
+
+    print("\nAverage Trial Time:")
+    print(f"  Baseline: {baseline_results['avg_time']:.2f}s")
+    print(f"  Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
+    print(f"  Overhead: {time_overhead_pct:+.2f}%")
+
+    print("\nThroughput (tokens/s):")
+    print(f"  Baseline: {baseline_results['throughput']:.2f}")
+    print(f"  Batch Invariant: {batch_inv_results['throughput']:.2f}")
+    print(f"  Change: {throughput_change_pct:+.2f}%")
+
+    print("\nPrompts/s:")
+    print(f"  Baseline: {baseline_results['prompts_per_sec']:.2f}")
+    print(f"  Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")
+
+    print("\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+    if time_overhead_pct > 0:
+        print(
+            f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
+            "overhead"
+        )
+    else:
+        print(
+            f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
+            "faster (unexpected!)"
+        )
+
+    if abs(throughput_change_pct) < 1.0:
+        print("Throughput difference is negligible (< 1%)")
+    elif throughput_change_pct < 0:
+        print(
+            f"Throughput decreased by {-throughput_change_pct:.1f}% "
+            "with batch invariance"
+        )
+    else:
+        print(
+            f"Throughput increased by {throughput_change_pct:.1f}% "
+            "with batch invariance (unexpected!)"
+        )
+
+    print("=" * 80 + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
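The benchmark above measures throughput overhead only; it asserts the needle prompt's position, not its text. A companion correctness check would compare greedy output for the same prompt alone and inside a batch. A minimal sketch, assuming an `llm` engine and `sampling` params built as in the file:

```python
def check_batch_invariance(llm, sampling, needle: str, filler: list[str]) -> bool:
    """Greedy output for `needle` should be identical alone and batched."""
    alone = llm.generate([needle], sampling)[0].outputs[0].text
    batched = llm.generate([needle, *filler], sampling)[0].outputs[0].text  # needle at index 0
    return alone == batched
```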
benchmarks/benchmark_hash.py (new file, 120 lines)
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+(32-byte bytes object, 32-int tuple)
+
+Usage:
+    python benchmarks/hash_micro_benchmark.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+    """Generate a deterministic test payload."""
+    random.seed(seed)
+    bytes_data = bytes(random.getrandbits(8) for _ in range(32))
+    int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
+    return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+    """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
+    times: list[float] = []
+
+    # Warm-up to avoid first-run noise.
+    for _ in range(200):
+        func(data)
+
+    for _ in range(iterations):
+        start = time.perf_counter()
+        func(data)
+        end = time.perf_counter()
+        times.append(end - start)
+
+    avg = statistics.mean(times)
+    std = statistics.stdev(times) if len(times) > 1 else 0.0
+    return avg, std
+
+
+def _run_benchmarks(
+    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+    data: tuple,
+    iterations: int,
+):
+    """Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
+    for name, func in benchmarks:
+        try:
+            avg, std = _benchmark_func(func, data, iterations)
+        except ModuleNotFoundError as exc:
+            print(f"Skipping {name}: {exc}")
+            continue
+        yield name, avg, std
+
+
+def builtin_hash(data: tuple) -> int:
+    """Wrapper for Python's built-in hash()."""
+    return hash(data)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=10_000,
+        help="Number of measured iterations per hash function.",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42, help="Random seed for test payload."
+    )
+    args = parser.parse_args()
+
+    data = _generate_test_data(args.seed)
+    benchmarks = (
+        ("SHA256 (pickle)", sha256),
+        ("xxHash (pickle)", xxhash),
+        ("built-in hash()", builtin_hash),
+    )
+
+    print("=" * 60)
+    print("HASH FUNCTION MICRO BENCHMARK")
+    print("=" * 60)
+    print("Test data: (32-byte bytes object, 32-int tuple)")
+    print(f"Iterations: {args.iterations:,}")
+    print("=" * 60)
+
+    results = list(_run_benchmarks(benchmarks, data, args.iterations))
+    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+    print("\nResults:")
+    for name, avg, std in results:
+        print(f"  {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+    if builtin_entry:
+        _, builtin_avg, _ = builtin_entry
+        print("\n" + "=" * 60)
+        print("SUMMARY (relative to built-in hash())")
+        print("=" * 60)
+        for name, avg, _ in results:
+            if name == "built-in hash()":
+                continue
+            speed_ratio = avg / builtin_avg
+            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+    else:
+        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+    main()
@@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )
 
     # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
benchmarks/benchmark_prefix_block_hash.py (new file, 110 lines)
@@ -0,0 +1,110 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Simple benchmark to compare prefix-cache block hashing algorithms.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import random
|
||||||
|
import statistics
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from collections.abc import Callable, Iterable, Sequence
|
||||||
|
|
||||||
|
from vllm.utils.hashing import get_hash_fn_by_name
|
||||||
|
from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
|
||||||
|
|
||||||
|
SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_blocks(
|
||||||
|
num_blocks: int, block_size: int, vocab_size: int, seed: int
|
||||||
|
) -> list[list[int]]:
|
||||||
|
rng = random.Random(seed)
|
||||||
|
return [
|
||||||
|
[rng.randrange(vocab_size) for _ in range(block_size)]
|
||||||
|
for _ in range(num_blocks)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _hash_all_blocks(
|
||||||
|
hash_fn: Callable[[object], bytes],
|
||||||
|
blocks: Iterable[Sequence[int]],
|
||||||
|
) -> float:
|
||||||
|
parent_hash: BlockHash | None = None
|
||||||
|
start = time.perf_counter()
|
||||||
|
for block in blocks:
|
||||||
|
parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
|
||||||
|
end = time.perf_counter()
|
||||||
|
return end - start
|
||||||
|
|
||||||
|
|
||||||
|
def _benchmark(
|
||||||
|
hash_algo: str,
|
||||||
|
blocks: list[list[int]],
|
||||||
|
trials: int,
|
||||||
|
) -> tuple[float, float, float] | None:
|
||||||
|
try:
|
||||||
|
hash_fn = get_hash_fn_by_name(hash_algo)
|
||||||
|
init_none_hash(hash_fn)
|
||||||
|
timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
|
||||||
|
except ModuleNotFoundError as exc:
|
||||||
|
print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
avg = statistics.mean(timings)
|
||||||
|
best = min(timings)
|
||||||
|
# throughput: tokens / second
|
||||||
|
tokens_hashed = len(blocks) * len(blocks[0])
|
||||||
|
throughput = tokens_hashed / best
|
||||||
|
return avg, best, throughput
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
parser = argparse.ArgumentParser(description=__doc__)
|
||||||
|
parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
|
||||||
|
parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
|
||||||
|
)
|
||||||
|
parser.add_argument("--seed", type=int, default=0, help="Random seed.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--trials", type=int, default=5, help="Number of timed trials per algorithm."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--algorithms",
|
||||||
|
nargs="+",
|
||||||
|
default=SUPPORTED_ALGOS,
|
||||||
|
choices=SUPPORTED_ALGOS,
|
||||||
|
help="Hash algorithms to benchmark.",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
blocks = _generate_blocks(
|
||||||
|
args.num_blocks, args.block_size, args.vocab_size, args.seed
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"Benchmarking {len(args.algorithms)} algorithms on "
|
||||||
|
f"{args.num_blocks} blocks (block size={args.block_size})."
|
||||||
|
)
|
||||||
|
|
||||||
|
for algo in args.algorithms:
|
||||||
|
result = _benchmark(algo, blocks, args.trials)
|
||||||
|
if result is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
avg, best, throughput = result
|
||||||
|
print(
|
||||||
|
f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s "
|
||||||
|
f"throughput: {throughput / 1e6:.2f}M tokens/s"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
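The benchmark above times hash_block_tokens, which chains each block's hash to its parent so a block hash identifies the whole token prefix, not just that block. A minimal self-contained sketch of that chaining, using hashlib rather than vLLM's actual implementation (illustration only; chained_block_hash is a hypothetical name):

import hashlib
import pickle

def chained_block_hash(parent_hash: bytes | None, block_tokens: list[int]) -> bytes:
    # Each block's hash folds in the parent's hash, so equal hashes imply an
    # equal token prefix, not merely an equal block of tokens.
    payload = pickle.dumps((parent_hash, tuple(block_tokens)))
    return hashlib.sha256(payload).digest()

h0 = chained_block_hash(None, [1, 2, 3, 4])
h1 = chained_block_hash(h0, [5, 6, 7, 8])
assert h1 != chained_block_hash(None, [5, 6, 7, 8])  # the prefix matters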
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.argparse_utils import FlexibleArgumentParser

 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
     from backend_request_func import get_tokenizer

@@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:

     # Remove the special tokens.
     return random.choices(
-        [v for k, v in vocab.items() if k not in all_special_ids],
+        [v for v in vocab.values() if v not in all_special_ids],
        k=length,
    )
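The second hunk above is a correctness fix, not a refactor: the vocab dict maps token strings to integer ids while all_special_ids holds integer ids, so the old membership test compared strings against integers and filtered nothing. A toy illustration with hypothetical values:

vocab = {"<s>": 0, "</s>": 1, "hello": 2, "world": 3}
all_special_ids = {0, 1}

# Old check: string keys are never in a set of ints, so nothing is dropped.
old = [v for k, v in vocab.items() if k not in all_special_ids]
# New check: compares ids against ids and actually excludes special tokens.
new = [v for v in vocab.values() if v not in all_special_ids]
assert old == [0, 1, 2, 3] and new == [2, 3]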
@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

 try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
 except ImportError:
     from backend_request_func import get_tokenizer

@@ -963,8 +963,7 @@ def create_argument_parser():
     parser.add_argument(
         "--profile",
         action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
     )
     parser.add_argument(
         "--result-dir",
@@ -5,11 +5,12 @@ import argparse
 import asyncio
 import logging
 import os
+import time
+import uuid
+from urllib.parse import urlparse

 import aiohttp
 from quart import Quart, Response, make_response, request
-from rate_limiter import RateLimiter
-from request_queue import RequestQueue

 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -24,26 +25,8 @@ def parse_args():
     parser.add_argument(
         "--timeout",
         type=float,
-        default=300,
-        help="Timeout for backend service requests in seconds (default: 300)",
-    )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        default=100,
-        help="Maximum concurrent requests to backend services (default: 100)",
-    )
-    parser.add_argument(
-        "--queue-size",
-        type=int,
-        default=500,
-        help="Maximum number of requests in the queue (default: 500)",
-    )
-    parser.add_argument(
-        "--rate-limit",
-        type=int,
-        default=40,
-        help="Maximum requests per second (default: 40)",
+        default=6 * 60 * 60,
+        help="Timeout for backend service requests in seconds (default: 21600)",
     )
     parser.add_argument(
         "--port",
@@ -54,14 +37,32 @@ def parse_args():
     parser.add_argument(
         "--prefill-url",
         type=str,
-        default="http://localhost:8100/v1/completions",
-        help="Prefill service endpoint URL",
+        default="http://localhost:8100",
+        help="Prefill service base URL (protocol + host[:port])",
     )
     parser.add_argument(
         "--decode-url",
         type=str,
-        default="http://localhost:8200/v1/completions",
-        help="Decode service endpoint URL",
+        default="http://localhost:8200",
+        help="Decode service base URL (protocol + host[:port])",
+    )
+    parser.add_argument(
+        "--kv-host",
+        type=str,
+        default="localhost",
+        help="Hostname or IP used by KV transfer (default: localhost)",
+    )
+    parser.add_argument(
+        "--prefill-kv-port",
+        type=int,
+        default=14579,
+        help="Prefill KV port (default: 14579)",
+    )
+    parser.add_argument(
+        "--decode-kv-port",
+        type=int,
+        default=14580,
+        help="Decode KV port (default: 14580)",
     )

     return parser.parse_args()
@@ -73,70 +74,129 @@ def main():

     # Initialize configuration using command line parameters
     AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
-    MAX_CONCURRENT_REQUESTS = args.max_concurrent
-    REQUEST_QUEUE_SIZE = args.queue_size
-    RATE_LIMIT = args.rate_limit
     PREFILL_SERVICE_URL = args.prefill_url
     DECODE_SERVICE_URL = args.decode_url
     PORT = args.port
+
+    PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
+    DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
+
+    logger.info(
+        "Proxy resolved KV addresses -> prefill: %s, decode: %s",
+        PREFILL_KV_ADDR,
+        DECODE_KV_ADDR,
+    )

     app = Quart(__name__)

-    # Initialize the rate limiter and request queue
-    rate_limiter = RateLimiter(RATE_LIMIT)
-    request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
-
-    # Attach the configuration object to the application instance
+    # Attach the configuration object to the application instance so helper
+    # coroutines can read the resolved backend URLs and timeouts without using
+    # globals.
     app.config.update(
         {
             "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
-            "rate_limiter": rate_limiter,
-            "request_queue": request_queue,
             "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
             "DECODE_SERVICE_URL": DECODE_SERVICE_URL,
+            "PREFILL_KV_ADDR": PREFILL_KV_ADDR,
+            "DECODE_KV_ADDR": DECODE_KV_ADDR,
         }
     )

-    # Start queue processing on app startup
-    @app.before_serving
-    async def startup():
-        """Start request processing task when app starts serving"""
-        asyncio.create_task(request_queue.process())
-
-    async def forward_request(url, data):
-        """Forward request to backend service with rate limiting and error handling"""
-        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-
-        # Use rate limiter as context manager
-        async with (
-            rate_limiter,
-            aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
-        ):
-            try:
-                async with session.post(
-                    url=url, json=data, headers=headers
-                ) as response:
-                    if response.status == 200:
-                        # Stream response chunks
-                        async for chunk_bytes in response.content.iter_chunked(1024):
-                            yield chunk_bytes
-                    else:
-                        # Handle backend service errors
-                        error_text = await response.text()
-                        logger.error(
-                            "Backend service error: %s - %s",
-                            response.status,
-                            error_text,
-                        )
-                        yield b'{"error": "Backend service error"}'
-            except aiohttp.ClientError as e:
-                # Handle connection errors
-                logger.error("Connection error to %s: %s", url, str(e))
-                yield b'{"error": "Service unavailable"}'
-            except asyncio.TimeoutError:
-                # Handle timeout errors
-                logger.error("Timeout connecting to %s", url)
-                yield b'{"error": "Service timeout"}'
+    def _normalize_base_url(url: str) -> str:
+        """Remove any trailing slash so path joins behave predictably."""
+        return url.rstrip("/")
+
+    def _get_host_port(url: str) -> str:
+        """Return the hostname:port portion for logging and KV headers."""
+        parsed = urlparse(url)
+        host = parsed.hostname or "localhost"
+        port = parsed.port
+        if port is None:
+            port = 80 if parsed.scheme == "http" else 443
+        return f"{host}:{port}"
+
+    PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
+    DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
+    KV_TARGET = _get_host_port(DECODE_SERVICE_URL)
+
+    def _build_headers(request_id: str) -> dict[str, str]:
+        """Construct the headers expected by vLLM's P2P disagg connector."""
+        headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
+        api_key = os.environ.get("OPENAI_API_KEY")
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+        return headers
+
+    async def _run_prefill(
+        request_path: str,
+        payload: dict,
+        headers: dict[str, str],
+        request_id: str,
+    ):
+        url = f"{PREFILL_BASE}{request_path}"
+        start_ts = time.perf_counter()
+        logger.info("[prefill] start request_id=%s url=%s", request_id, url)
+        try:
+            async with (
+                aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+                session.post(url=url, json=payload, headers=headers) as resp,
+            ):
+                if resp.status != 200:
+                    error_text = await resp.text()
+                    raise RuntimeError(
+                        f"Prefill backend error {resp.status}: {error_text}"
+                    )
+                await resp.read()
+            logger.info(
+                "[prefill] done request_id=%s status=%s elapsed=%.2fs",
+                request_id,
+                resp.status,
+                time.perf_counter() - start_ts,
+            )
+        except asyncio.TimeoutError as exc:
+            raise RuntimeError(f"Prefill service timeout at {url}") from exc
+        except aiohttp.ClientError as exc:
+            raise RuntimeError(f"Prefill service unavailable at {url}") from exc
+
+    async def _stream_decode(
+        request_path: str,
+        payload: dict,
+        headers: dict[str, str],
+        request_id: str,
+    ):
+        url = f"{DECODE_BASE}{request_path}"
+        # Stream tokens from the decode service once the prefill stage has
+        # materialized KV caches on the target workers.
+        logger.info("[decode] start request_id=%s url=%s", request_id, url)
+        try:
+            async with (
+                aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+                session.post(url=url, json=payload, headers=headers) as resp,
+            ):
+                if resp.status != 200:
+                    error_text = await resp.text()
+                    logger.error(
+                        "Decode backend error %s - %s", resp.status, error_text
+                    )
+                    err_msg = (
+                        '{"error": "Decode backend error ' + str(resp.status) + '"}'
+                    )
+                    yield err_msg.encode()
+                    return
+                logger.info(
+                    "[decode] streaming response request_id=%s status=%s",
+                    request_id,
+                    resp.status,
+                )
+                async for chunk_bytes in resp.content.iter_chunked(1024):
+                    yield chunk_bytes
+                logger.info("[decode] finished streaming request_id=%s", request_id)
+        except asyncio.TimeoutError:
+            logger.error("Decode service timeout at %s", url)
+            yield b'{"error": "Decode service timeout"}'
+        except aiohttp.ClientError as exc:
+            logger.error("Decode service error at %s: %s", url, exc)
+            yield b'{"error": "Decode service unavailable"}'

     async def process_request():
         """Process a single request through prefill and decode stages"""
@@ -146,13 +206,27 @@ def main():
         # Create prefill request (max_tokens=1)
         prefill_request = original_request_data.copy()
         prefill_request["max_tokens"] = 1
+        if "max_completion_tokens" in prefill_request:
+            prefill_request["max_completion_tokens"] = 1

         # Execute prefill stage
-        async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
-            continue
+        # The request id encodes both KV socket addresses so the backend can
+        # shuttle tensors directly via NCCL once the prefill response
+        # completes.
+        request_id = (
+            f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
+            f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
+        )
+
+        headers = _build_headers(request_id)
+        await _run_prefill(request.path, prefill_request, headers, request_id)

         # Execute decode stage and stream response
-        generator = forward_request(DECODE_SERVICE_URL, original_request_data)
+        # Pass the unmodified user request so the decode phase can continue
+        # sampling with the already-populated KV cache.
+        generator = _stream_decode(
+            request.path, original_request_data, headers, request_id
+        )
         response = await make_response(generator)
         response.timeout = None  # Disable timeout for streaming response
         return response
@@ -168,23 +242,10 @@ def main():
     @app.route("/v1/completions", methods=["POST"])
     async def handle_request():
         """Handle incoming API requests with concurrency and rate limiting"""
-        # Create task for request processing
-        task = asyncio.create_task(process_request())
-
-        # Enqueue request or reject if queue is full
-        if not await request_queue.enqueue(task):
-            return Response(
-                response=b'{"error": "Server busy, try again later"}',
-                status=503,
-                content_type="application/json",
-            )
-
         try:
-            # Return the response from the processing task
-            return await task
+            return await process_request()
         except asyncio.CancelledError:
-            # Handle task cancellation (timeout or queue full)
-            logger.warning("Request cancelled due to timeout or queue full")
+            logger.warning("Request cancelled")
             return Response(
                 response=b'{"error": "Request cancelled"}',
                 status=503,
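As the new comment in process_request notes, the proxy smuggles both KV socket addresses inside the request id. A minimal sketch of how a consumer could take that id back apart (the actual parsing inside vLLM's P2P disagg connector may differ; parse_kv_addrs is a hypothetical helper for illustration):

import uuid

prefill_kv_addr = "localhost:14579"
decode_kv_addr = "localhost:14580"
request_id = (
    f"___prefill_addr_{prefill_kv_addr}___decode_addr_"
    f"{decode_kv_addr}_{uuid.uuid4().hex}"
)

def parse_kv_addrs(request_id: str) -> tuple[str, str]:
    # Split on the sentinel markers embedded in the id.
    _, rest = request_id.split("___prefill_addr_", 1)
    prefill, rest = rest.split("___decode_addr_", 1)
    decode, _ = rest.rsplit("_", 1)  # strip the trailing uuid hex
    return prefill, decode

assert parse_kv_addrs(request_id) == (prefill_kv_addr, decode_kv_addr)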
@@ -14,6 +14,9 @@ from tqdm import tqdm

 import vllm._custom_ops as ops
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+)


 @dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
     hidden_size: int
     add_residual: bool
     dtype: torch.dtype
+    group_size: list[int]

     def description(self):
         return (
@@ -29,6 +33,7 @@ class bench_params_t:
             f"x D {self.hidden_size} "
             f"x R {self.add_residual} "
             f"x DT {self.dtype}"
+            f"x GS {self.group_size}"
         )


@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
     HIDDEN_SIZES = list(range(1024, 8129, 1024))
     ADD_RESIDUAL = [True, False]
     DTYPES = [torch.bfloat16, torch.float]
+    GROUP_SIZES = [[1, 64], [1, 128]]

-    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
     bench_params = list(
-        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
     )
     return bench_params

@@ -52,6 +58,7 @@ def unfused_int8_impl(
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     # Norm
     torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     # Norm
     torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
     torch_out, _ = ops.scaled_fp8_quant(torch_out)


+def unfused_groupwise_fp8_impl(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    # Norm
+    torch_out = None
+    if residual is None:
+        torch_out = rms_norm_layer.forward_cuda(x, residual)
+    else:
+        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+    # Quant
+    torch_out, _ = per_token_group_quant_fp8(
+        torch_out, group_size=group_size[1], use_ue8m0=False
+    )
+
+
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
     residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
         x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
     )
+
+
+def fused_groupwise_impl(
+    rms_norm_layer: RMSNorm,  # this stores the weights
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    out, _ = ops.rms_norm_per_block_quant(
+        x,
+        rms_norm_layer.weight,
+        1e-6,
+        quant_dtype,
+        group_size,
+        residual=residual,
+        is_scale_transposed=True,
+    )


 # Bench functions
 def bench_fn(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
     residual: torch.Tensor,
     quant_dtype: torch.dtype,
+    group_size: list[int],
     label: str,
     sub_label: str,
     fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
         "x": x,
         "residual": residual,
         "quant_dtype": quant_dtype,
+        "group_size": group_size,
         "fn": fn,
     }
     return TBenchmark.Timer(
-        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+        stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
         globals=globals,
         label=label,
         sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.int8,
+            params.group_size,
             label,
             sub_label,
             unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.float8_e4m3fn,
+            params.group_size,
             label,
             sub_label,
             unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.int8,
+            params.group_size,
             label,
             sub_label,
             fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
             x,
             residual,
             torch.float8_e4m3fn,
+            params.group_size,
             label,
             sub_label,
             fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
         )
     )

+    # unfused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            unfused_groupwise_fp8_impl,
+            "unfused_groupwise_fp8_impl",
+        )
+    )
+
+    # fused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            fused_groupwise_impl,
+            "fused_groupwise_fp8_impl",
+        )
+    )
+
     print_timers(timers)

     return timers
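For reference, the semantics that per_token_group_quant_fp8 implements for a [1, G] group shape can be written in a few lines of plain PyTorch. This is a sketch of the math, not the Triton kernel; per_token_group_quant_fp8_ref is a hypothetical name, and the 1e-10 floor mirrors the kernel's eps default:

import torch

def per_token_group_quant_fp8_ref(x: torch.Tensor, group_size: int):
    finfo = torch.finfo(torch.float8_e4m3fn)
    t, h = x.shape
    assert h % group_size == 0
    g = x.view(t, h // group_size, group_size).float()
    # One scale per (token, group): max-abs mapped onto the fp8 dynamic range.
    scales = g.abs().amax(dim=-1, keepdim=True).clamp(min=1e-10) / finfo.max
    q = (g / scales).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return q.view(t, h), scales.squeeze(-1)

x = torch.randn(4, 256)
q, s = per_token_group_quant_fp8_ref(x, group_size=128)
assert q.shape == x.shape and s.shape == (4, 2)  # one scale per token per group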
@@ -1,10 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+# Disable DeepGEMM for this benchmark to use CUTLASS
+os.environ["VLLM_USE_DEEP_GEMM"] = "0"

 import torch

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_w8a8_block_fp8_linear,
+    W8A8BlockFp8LinearOp,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
     fp8_info = torch.finfo(torch.float8_e4m3fn)
     fp8_max, fp8_min = fp8_info.max, fp8_info.min

-    # Create random FP8 tensors
+    # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp)
     A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+
+    # Create quantized weight tensor
     B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
     B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

-    # Create scales
+    # Create weight scales
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
     k_tiles = (K + block_k - 1) // block_k
@@ -55,18 +64,24 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
         * factor_for_scale
     )

-    # SM90 CUTLASS requires row-major format for scales
-    if use_cutlass and current_platform.is_device_capability(90):
-        Bs = Bs.T.contiguous()
+    # Create W8A8BlockFp8LinearOp instance
+    weight_group_shape = GroupShape(block_n, block_k)
+    act_quant_group_shape = GroupShape(1, block_k)  # Per-token, per-group quantization
+
+    linear_op = W8A8BlockFp8LinearOp(
+        weight_group_shape=weight_group_shape,
+        act_quant_group_shape=act_quant_group_shape,
+        cutlass_block_fp8_supported=use_cutlass,
+        use_aiter_and_is_supported=False,
+    )

     def run():
-        if use_cutlass:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
-            )
-        else:
-            return apply_w8a8_block_fp8_linear(
-                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
-            )
+        return linear_op.apply(
+            input=A_ref,
+            weight=B,
+            weight_scale=Bs,
+            input_scale=None,
+            bias=None,
+        )

     return run
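The scale tensor Bs built above holds one FP32 scale per (block_n, block_k) weight tile, sized with ceiling division on each axis. A quick arithmetic check of that tiling (values here are illustrative, not from the benchmark config):

N, K = 4096, 8192
block_n, block_k = 128, 128
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
assert (n_tiles, k_tiles) == (32, 64)  # scale grid shape for this layer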
benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py (new file, 244 lines)
@@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from enum import Enum
from itertools import product
from typing import Any

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    _per_token_group_quant_fp8_colmajor,
    silu_mul_per_token_group_quant_fp8_colmajor,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used

from .utils import ArgPool, Bench, CudaGraphBenchParams

GROUP_SIZE = 128
FLOAT8_T = torch.float8_e4m3fn


def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
    print(
        f"Note : The timings reported above are for {cuda_graph_nops} "
        "consecutive invocations of the benchmarking functions. "
        f"Please divide by {cuda_graph_nops} for single invocation "
        "timings."
    )
    compare = TBenchmark.Compare(timers)
    compare.print()


class ImplType(Enum):
    SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
    REFERENCE = 2

    def get_impl(self):
        if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            return silu_mul_per_token_group_quant_fp8_colmajor
        elif self == ImplType.REFERENCE:
            return reference
        raise ValueError(f"Unrecognized ImplType {self}")


@dataclass
class BenchmarkTensors:
    input: torch.Tensor
    output: torch.Tensor

    # Reference act output tensor
    ref_act_out: torch.Tensor
    ref_quant_out: torch.Tensor

    @staticmethod
    def make(T: int, N: int) -> "BenchmarkTensors":
        assert T % GROUP_SIZE == 0
        assert N % (GROUP_SIZE * 2) == 0

        input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")

        # silu_mul_per_token_group_quant_fp8_colmajor output.
        output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
            FLOAT8_T
        )

        # reference output.
        ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
        ref_quant_out = torch.empty(
            (T, N // 2), dtype=torch.bfloat16, device="cuda"
        ).to(FLOAT8_T)

        return BenchmarkTensors(
            input=input,
            output=output,
            ref_act_out=ref_act_out,
            ref_quant_out=ref_quant_out,
        )

    @property
    def T(self):
        return self.input.size(0)

    @property
    def N(self):
        return self.input.size(1)

    def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
        if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
            return {
                "input": self.input,
                "output": self.output,
                "use_ue8m0": is_deep_gemm_e8m0_used(),
            }
        elif impl_type == ImplType.REFERENCE:
            return {
                "input": self.input,
                "act_out": self.ref_act_out,
                "quant_out": self.ref_quant_out,
                "use_ue8m0": is_deep_gemm_e8m0_used(),
            }
        raise ValueError(f"Unrecognized impl_type {impl_type}")


def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
    """
    Reference triton quant kernel from
    vllm.model_executor.layers.quantization.utils.fp8_utils
    """
    assert quant_out.size() == x.size()
    # Allocate the scale tensor in column-major format.
    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
    x_q = quant_out
    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)

    M = x.numel() // GROUP_SIZE
    N = GROUP_SIZE
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1

    finfo = torch.finfo(FLOAT8_T)
    fp8_min = finfo.min
    fp8_max = finfo.max

    _per_token_group_quant_fp8_colmajor[(M,)](
        x,
        x_q,
        x_s,
        GROUP_SIZE,
        x.shape[1],
        x.stride(0),
        x_s.stride(1),
        eps=1e-10,
        fp8_min=fp8_min,
        fp8_max=fp8_max,
        use_ue8m0=use_ue8m0,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return x_q, x_s


def reference(
    input: torch.Tensor,
    act_out: torch.Tensor,
    quant_out: torch.Tensor,
    use_ue8m0: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    torch.ops._C.silu_and_mul(act_out, input)
    return reference_quant(act_out, quant_out, use_ue8m0)


def bench_impl(
    bench_tensors: list[BenchmarkTensors], impl_type: ImplType
) -> TMeasurement:
    T = bench_tensors[0].T
    N = bench_tensors[0].N

    arg_pool_size = len(bench_tensors)
    kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]

    # warmup
    for kwargs in kwargs_list:
        impl_type.get_impl()(**kwargs)
    torch.cuda.synchronize()

    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
    for _kwargs in kwargs_list:
        for k, v in _kwargs.items():
            kwargs[k].values.append(v)

    cuda_graph_params = None
    cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
    timer = None
    with Bench(
        cuda_graph_params,
        "silu-mul-quant",
        f"num_tokens={T}, N={N}",
        impl_type.name,
        impl_type.get_impl(),
        **kwargs,
    ) as bench:
        timer = bench.run()
    return timer


def test_correctness(T: int, N: int):
    print(f"Testing num_tokens={T}, N={N} ...")

    bench_tensor = BenchmarkTensors.make(T, N)

    def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
        return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))

    # reference output
    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)

    # test output
    out_q, out_s = output_from_impl(
        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
    )

    torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
    torch.testing.assert_close(ref_out_s, out_s)


def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
    timers = []
    for N, T in product(Ns, Ts):
        test_correctness(T, N)

        bench_tensors: list[BenchmarkTensors] = [
            BenchmarkTensors.make(T, N) for _ in range(arg_pool_size)
        ]

        silu_mul_quant_timer = bench_impl(
            bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
        )
        timers.append(silu_mul_quant_timer)
        reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE)
        timers.append(reference_timer)

        print_timers(
            [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size
        )

    print_timers(timers, cuda_graph_nops=arg_pool_size)

    return timers


if __name__ == "__main__":
    T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
    N = [2048, 4096, 8192]

    print(f"T = {T}, N = {N}")
    run(T, N, arg_pool_size=8)
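The fused kernel benchmarked by this new file computes SiLU(gate) * up and then quantizes per token group. The activation half, written independently in plain PyTorch to show the reference semantics of torch.ops._C.silu_and_mul (a sketch, not vLLM's kernel):

import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # x packs the gate and up projections along the last dim: [..., 2 * d].
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(8, 512)
out = silu_and_mul_ref(x)
assert out.shape == (8, 256)  # the output halves the packed dimension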
@@ -255,8 +255,8 @@ def bench_run(
     torch.cuda.synchronize()

     # Timing
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

     latencies = []
     for _ in range(num_iters):
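torch.Event is the device-generic successor to torch.cuda.Event; the record/synchronize/elapsed_time pattern itself is unchanged. A minimal sketch, assuming PyTorch 2.3+ (where torch.Event is available) and a CUDA device:

import torch

if torch.cuda.is_available():
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
    x = torch.randn(1024, 1024, device="cuda")
    start_event.record()
    y = x @ x
    end_event.record()
    torch.cuda.synchronize()  # events are measured on the device timeline
    print(f"matmul took {start_event.elapsed_time(end_event):.3f} ms")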
benchmarks/kernels/benchmark_fused_collective.py (new file, 1129 lines)
File diff suppressed because it is too large
@@ -16,8 +16,8 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
 from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = [
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
-    "nm-testing/deepseekv2-lite",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "deepseek-ai/DeepSeek-V2-Lite",
     "ibm-granite/granite-3.0-1b-a400m",
     "ibm-granite/granite-3.0-3b-a800m",
 ]
@@ -19,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement
|
|||||||
from utils import ArgPool, Bench, CudaGraphBenchParams
|
from utils import ArgPool, Bench, CudaGraphBenchParams
|
||||||
from weight_shapes import WEIGHT_SHAPES
|
from weight_shapes import WEIGHT_SHAPES
|
||||||
|
|
||||||
from vllm.triton_utils import HAS_TRITON
|
from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
|
||||||
|
from vllm.triton_utils import HAS_TRITON, triton
|
||||||
|
|
||||||
if HAS_TRITON:
|
if HAS_TRITON:
|
||||||
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
|
from vllm.lora.ops.triton_ops import ( ## added fused_moe_lora
|
||||||
|
LoRAKernelMeta,
|
||||||
|
fused_moe_lora_expand,
|
||||||
|
fused_moe_lora_shrink,
|
||||||
|
lora_expand,
|
||||||
|
lora_shrink,
|
||||||
|
)
|
||||||
|
from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
|
||||||
|
_LORA_PTR_DICT, ## added _LORA_PTR_DICT for fused_moe_lora
|
||||||
|
)
|
||||||
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
|
from vllm.utils.math_utils import round_up
|
||||||
|
|
||||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||||
DEFAULT_TP_SIZES = [1]
|
DEFAULT_TP_SIZES = [1]
|
||||||
@@ -59,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4]
|
|||||||
DEFAULT_SORT_BY_LORA_IDS = [False, True]
|
DEFAULT_SORT_BY_LORA_IDS = [False, True]
|
||||||
DEFAULT_SEQ_LENGTHS = [1]
|
DEFAULT_SEQ_LENGTHS = [1]
|
||||||
DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
|
DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
|
||||||
|
DEFAULT_TOP_K_NUMS = [1] # Added for MoE LoRA top_k
|
||||||
|
DEFAULT_NUM_EXPERTS = [8] # Added for MoE LoRA num_experts
|
||||||
|
|
||||||
|
|
||||||
# Utilities
|
# Utilities
|
||||||
@@ -191,6 +204,11 @@ class OpType(Enum):
|
|||||||
|
|
||||||
LORA_SHRINK = auto()
|
LORA_SHRINK = auto()
|
||||||
LORA_EXPAND = auto()
|
LORA_EXPAND = auto()
|
||||||
|
## Adding support for fused moe lora
|
||||||
|
FUSED_MOE_LORA_GATE_UP_SHRINK = auto() ## Gate/Up projection variant with shrink
|
||||||
|
FUSED_MOE_LORA_GATE_UP_EXPAND = auto() ## Gate/Up projection variant with expand
|
||||||
|
FUSED_MOE_LORA_DOWN_SHRINK = auto() ## Down projection variant with shrink
|
||||||
|
FUSED_MOE_LORA_DOWN_EXPAND = auto() ## Down projection variant with expand
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_str(s: str) -> "OpType":
|
def from_str(s: str) -> "OpType":
|
||||||
@@ -198,6 +216,15 @@ class OpType(Enum):
|
|||||||
return OpType.LORA_SHRINK
|
return OpType.LORA_SHRINK
|
||||||
if s.lower() == "lora_expand":
|
if s.lower() == "lora_expand":
|
||||||
return OpType.LORA_EXPAND
|
return OpType.LORA_EXPAND
|
||||||
|
# Adding support for fused moe lora, both in gate_up and down
|
||||||
|
if s.lower() == "fused_moe_lora_gate_up_shrink": ## Gate/Up variant with shrink
|
||||||
|
return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK
|
||||||
|
if s.lower() == "fused_moe_lora_gate_up_expand": ## Gate/Up variant with expand
|
||||||
|
return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND
|
||||||
|
if s.lower() == "fused_moe_lora_down_shrink": ## Down variant with shrink
|
||||||
|
return OpType.FUSED_MOE_LORA_DOWN_SHRINK
|
||||||
|
if s.lower() == "fused_moe_lora_down_expand": ## Down variant with expand
|
||||||
|
return OpType.FUSED_MOE_LORA_DOWN_EXPAND
|
||||||
raise ValueError(f"Unrecognized str {s} to convert to OpType")
|
raise ValueError(f"Unrecognized str {s} to convert to OpType")
|
||||||
|
|
||||||
def is_shrink_fn(self) -> bool:
|
def is_shrink_fn(self) -> bool:
|
||||||
@@ -206,19 +233,56 @@ class OpType(Enum):
|
|||||||
def is_expand_fn(self) -> bool:
|
def is_expand_fn(self) -> bool:
|
||||||
return self in [OpType.LORA_EXPAND]
|
return self in [OpType.LORA_EXPAND]
|
||||||
|
|
||||||
|
def is_fused_moe_lora_fn(self) -> bool: ## adding for fused MoE LoRA
|
||||||
|
return self in [
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_SHRINK,
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_EXPAND,
|
||||||
|
]
|
||||||
|
|
||||||
|
def is_fused_moe_lora_gate_up_fn(
|
||||||
|
self,
|
||||||
|
) -> bool: ## adding for fused MoE LoRA Gate/Up
|
||||||
|
return self in [
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
|
||||||
|
]
|
||||||
|
|
||||||
|
def is_fused_moe_lora_down_fn(self) -> bool: ## adding for fused MoE LoRA Down
|
||||||
|
return self in [
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_SHRINK,
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_EXPAND,
|
||||||
|
]
|
||||||
|
|
||||||
|
def is_fused_moe_lora_shrink_fn(self) -> bool:
|
||||||
|
return self in [
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_SHRINK,
|
||||||
|
]
|
||||||
|
|
||||||
|
def is_fused_moe_lora_expand_fn(self) -> bool:
|
||||||
|
return self in [
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_EXPAND,
|
||||||
|
]
|
||||||
|
|
||||||
def num_slices(self) -> list[int]:
|
def num_slices(self) -> list[int]:
|
||||||
|
if self.is_fused_moe_lora_gate_up_fn():
|
||||||
|
return [2]
|
||||||
|
elif self.is_fused_moe_lora_down_fn():
|
||||||
|
return [1]
|
||||||
return [1, 2, 3]
|
return [1, 2, 3]
|
||||||
|
|
||||||
def mkn(
|
def mkn(
|
||||||
self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
|
self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
num_tokens = batch_size * seq_length
|
num_tokens = batch_size * seq_length
|
||||||
if self.is_shrink_fn():
|
if self.is_shrink_fn() or self.is_fused_moe_lora_fn():
|
||||||
m = num_tokens
|
m = num_tokens
|
||||||
k = hidden_size
|
k = hidden_size
|
||||||
n = lora_rank
|
n = lora_rank
|
||||||
else:
|
elif self.is_expand_fn():
|
||||||
assert self.is_expand_fn()
|
|
||||||
m = num_tokens
|
m = num_tokens
|
||||||
k = lora_rank
|
k = lora_rank
|
||||||
n = hidden_size
|
n = hidden_size
|
||||||
@@ -232,9 +296,36 @@ class OpType(Enum):
|
|||||||
"""
|
"""
|
||||||
if self.is_shrink_fn():
|
if self.is_shrink_fn():
|
||||||
return op_dtype, op_dtype, torch.float32
|
return op_dtype, op_dtype, torch.float32
|
||||||
else:
|
elif self.is_expand_fn():
|
||||||
assert self.is_expand_fn()
|
|
||||||
return torch.float32, op_dtype, op_dtype
|
return torch.float32, op_dtype, op_dtype
|
||||||
|
else:
|
||||||
|
assert self.is_fused_moe_lora_fn()
|
||||||
|
return op_dtype, op_dtype, op_dtype
|
||||||
|
|
||||||
|
def matmul_shapes_fused_moe_lora(
|
||||||
|
self,
|
||||||
|
m: int,
|
||||||
|
n: int,
|
||||||
|
k: int,
|
||||||
|
num_loras: int,
|
||||||
|
num_slices: int,
|
||||||
|
top_k_num: int,
|
||||||
|
num_experts: int,
|
||||||
|
) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]:
|
||||||
|
if self.is_fused_moe_lora_shrink_fn():
|
||||||
|
input_shape = (
|
||||||
|
(m * top_k_num, n)
|
||||||
|
if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
|
||||||
|
else (m, n)
|
||||||
|
)
|
||||||
|
output_shape = (num_slices, m, top_k_num, k)
|
||||||
|
weight_shape = (num_loras, num_experts, k, n)
|
||||||
|
else:
|
||||||
|
assert self.is_fused_moe_lora_expand_fn()
|
||||||
|
input_shape = (num_slices, m, top_k_num, k)
|
||||||
|
output_shape = (m, top_k_num, n * num_slices)
|
||||||
|
weight_shape = (num_loras, num_experts, n, k)
|
||||||
|
return (input_shape, weight_shape, output_shape)
|
||||||
|
|
||||||
def matmul_shapes(
|
def matmul_shapes(
|
||||||
self,
|
self,
|
||||||
@@ -244,6 +335,8 @@ class OpType(Enum):
|
|||||||
lora_rank: int,
|
lora_rank: int,
|
||||||
num_loras: int,
|
num_loras: int,
|
||||||
num_slices: int,
|
num_slices: int,
|
||||||
|
top_k_num: int | None = None,
|
||||||
|
num_experts: int | None = None,
|
||||||
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
|
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
|
||||||
"""
|
"""
|
||||||
Given num_slices, return the shapes of the A, B, and C matrices
|
Given num_slices, return the shapes of the A, B, and C matrices
|
||||||
@@ -258,6 +351,16 @@ class OpType(Enum):
|
|||||||
if self in [OpType.LORA_EXPAND]:
|
if self in [OpType.LORA_EXPAND]:
|
||||||
# LoRA expand kernels support num_slices inherently in the kernel
|
# LoRA expand kernels support num_slices inherently in the kernel
|
||||||
return ((num_slices, m, k), b_shape, (m, n * num_slices))
|
return ((num_slices, m, k), b_shape, (m, n * num_slices))
|
||||||
|
if self.is_fused_moe_lora_fn():
|
||||||
|
return self.matmul_shapes_fused_moe_lora(
|
||||||
|
m,
|
||||||
|
k,
|
||||||
|
n,
|
||||||
|
num_loras,
|
||||||
|
num_slices,
|
||||||
|
top_k_num,
|
||||||
|
num_experts,
|
||||||
|
)
|
||||||
raise ValueError(f"Unrecognized op_type {self}")
|
raise ValueError(f"Unrecognized op_type {self}")
|
||||||
|
|
||||||
def bench_fn(self) -> Callable:
|
def bench_fn(self) -> Callable:
|
||||||
@@ -265,6 +368,16 @@ class OpType(Enum):
|
|||||||
return lora_shrink
|
return lora_shrink
|
||||||
if self == OpType.LORA_EXPAND:
|
if self == OpType.LORA_EXPAND:
|
||||||
return lora_expand
|
return lora_expand
|
||||||
|
if self in [
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_SHRINK,
|
||||||
|
]:
|
||||||
|
return fused_moe_lora_shrink
|
||||||
|
if self in [
|
||||||
|
OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
|
||||||
|
OpType.FUSED_MOE_LORA_DOWN_EXPAND,
|
||||||
|
]:
|
||||||
|
return fused_moe_lora_expand
|
||||||
|
|
||||||
raise ValueError(f"Unrecognized optype {self}")
|
raise ValueError(f"Unrecognized optype {self}")
|
||||||
|
|
||||||
@@ -318,6 +431,8 @@ class BenchmarkContext:
|
|||||||
sort_by_lora_id: bool
|
sort_by_lora_id: bool
|
||||||
dtype: torch.dtype
|
dtype: torch.dtype
|
||||||
seq_length: int | None = None
|
seq_length: int | None = None
|
||||||
|
num_experts: int | None = None # num_experts for MoE based ops
|
||||||
|
top_k_num: int | None = None # top_k for MoE based ops
|
||||||
num_slices: int | None = None # num_slices for slice based ops
|
num_slices: int | None = None # num_slices for slice based ops
|
||||||
|
|
||||||
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
|
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
|
||||||
@@ -373,6 +488,11 @@ class BenchmarkTensors:
|
|||||||
f"{dtype_to_str(self.output.dtype)}"
|
f"{dtype_to_str(self.output.dtype)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType):
|
||||||
|
return (
|
||||||
|
size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make(
|
def make(
|
||||||
ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
|
ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
|
||||||
@@ -385,6 +505,8 @@ class BenchmarkTensors:
|
|||||||
ctx.lora_rank,
|
ctx.lora_rank,
|
||||||
ctx.num_loras,
|
ctx.num_loras,
|
||||||
ctx.num_slices,
|
ctx.num_slices,
|
||||||
|
ctx.top_k_num,
|
||||||
|
ctx.num_experts,
|
||||||
)
|
)
|
||||||
a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
|
a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
|
||||||
input_tensor, lora_weights, output_tensor = make_rand_tensors(
|
input_tensor, lora_weights, output_tensor = make_rand_tensors(
|
||||||
@@ -432,17 +554,27 @@ class BenchmarkTensors:
|
|||||||
prompt_lora_indices_tensor,
|
prompt_lora_indices_tensor,
|
||||||
)
|
)
|
||||||
|
|
||||||
def sanity_check(self) -> None:
|
def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None:
|
||||||
"""
|
"""
|
||||||
Fails asserts when non-conformality is detected.
|
Fails asserts when non-conformality is detected.
|
||||||
"""
|
"""
|
||||||
num_tokens = self.input.shape[-2]
|
num_tokens = (
|
||||||
|
self.input.shape[1]
|
||||||
|
if op_type.is_fused_moe_lora_expand_fn()
|
||||||
|
else self.input.shape[-2]
|
||||||
|
)
|
||||||
# check metadata tensors
|
# check metadata tensors
|
||||||
assert torch.sum(self.seq_lens) == num_tokens
|
## In down shrink case, each token is repeated top_k_num times
|
||||||
|
assert num_tokens == self.get_num_tokens(
|
||||||
|
torch.sum(self.seq_lens), ctx.top_k_num, op_type
|
||||||
|
), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}"
|
||||||
num_seqs = self.seq_lens.shape[0]
|
num_seqs = self.seq_lens.shape[0]
|
||||||
# assert self.seq_start_loc.shape[0] == num_seqs
|
# assert self.seq_start_loc.shape[0] == num_seqs
|
||||||
|
## In down shrink case, each prompt corresponds to top_k_num sequences
|
||||||
assert self.prompt_lora_mapping.shape[0] == num_seqs
|
assert self.prompt_lora_mapping.shape[0] == num_seqs
|
||||||
assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
|
assert self.get_num_tokens(
|
||||||
|
self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
|
||||||
|
)
|
||||||
|
|
||||||
def to_device(self, device: str):
|
def to_device(self, device: str):
|
||||||
"""
|
"""
|
||||||
@@ -471,21 +603,111 @@ class BenchmarkTensors:
|
|||||||
to_device(field) if field_name != "no_lora_flag_cpu" else field,
|
to_device(field) if field_name != "no_lora_flag_cpu" else field,
|
||||||
)
|
)
|
||||||
|
|
||||||
def metadata(self) -> tuple[int, int, int]:
|
def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]:
|
||||||
"""
|
"""
|
||||||
Return num_seqs, num_tokens and max_seq_len
|
Return num_seqs, num_tokens and max_seq_len
|
||||||
"""
|
"""
|
||||||
num_seqs = self.seq_lens.shape[0]
|
num_seqs = self.seq_lens.shape[0]
|
||||||
num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
|
num_tokens = self.get_num_tokens(
|
||||||
|
self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
|
||||||
|
)
|
||||||
max_seq_len = torch.max(self.seq_lens).item()
|
max_seq_len = torch.max(self.seq_lens).item()
|
||||||
num_slices = len(self.lora_weights_lst)
|
num_slices = len(self.lora_weights_lst)
|
||||||
return num_seqs, num_tokens, max_seq_len, num_slices
|
return num_seqs, num_tokens, max_seq_len, num_slices
|
||||||
|
|
||||||
def as_lora_shrink_kwargs(self) -> dict[str, Any]:
|
def fused_moe_lora_data_prepare(
|
||||||
self.sanity_check()
|
self,
|
||||||
|
block_size: int,
|
||||||
|
token_lora_mapping: torch.Tensor,
|
||||||
|
ctx: BenchmarkContext,
|
||||||
|
):
|
||||||
|
def moe_lora_align_block_size(
|
||||||
|
topk_ids: torch.Tensor,
|
||||||
|
token_lora_mapping: torch.Tensor,
|
||||||
|
block_size: int,
|
||||||
|
num_experts: int,
|
||||||
|
max_loras: int,
|
||||||
|
expert_map: torch.Tensor | None = None,
|
||||||
|
pad_sorted_ids: bool = False,
|
||||||
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||||
|
"""
|
||||||
|
Aligns tokens and experts into block-sized chunks for LoRA-based
|
||||||
|
mixture-of-experts (MoE) execution.
|
||||||
|
"""
|
||||||
|
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||||
|
if pad_sorted_ids:
|
||||||
|
max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
|
||||||
|
sorted_ids = torch.empty(
|
||||||
|
(max_loras * max_num_tokens_padded,),
|
||||||
|
dtype=torch.int32,
|
||||||
|
device=topk_ids.device,
|
||||||
|
)
|
||||||
|
max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
|
||||||
|
# Expert ids must be set default to -1 to prevent a blank block
|
||||||
|
expert_ids = torch.empty(
|
||||||
|
(max_loras * max_num_m_blocks,),
|
||||||
|
dtype=torch.int32,
|
||||||
|
device=topk_ids.device,
|
||||||
|
)
|
||||||
|
num_tokens_post_pad = torch.empty(
|
||||||
|
(max_loras), dtype=torch.int32, device=topk_ids.device
|
||||||
|
)
|
||||||
|
|
||||||
|
ops.moe_lora_align_block_size(
|
||||||
|
topk_ids,
|
||||||
|
token_lora_mapping,
|
||||||
|
num_experts,
|
||||||
|
block_size,
|
||||||
|
max_loras,
|
||||||
|
max_num_tokens_padded,
|
||||||
|
max_num_m_blocks,
|
||||||
|
sorted_ids,
|
||||||
|
expert_ids,
|
||||||
|
num_tokens_post_pad,
|
||||||
|
)
|
||||||
|
if expert_map is not None:
|
||||||
|
expert_ids = expert_map[expert_ids]
|
||||||
|
|
||||||
|
return sorted_ids, expert_ids, num_tokens_post_pad
|
||||||
|
|
||||||
|
num_tokens = ctx.batch_size
|
||||||
|
curr_topk_ids = torch.randint(
|
||||||
|
0,
|
||||||
|
ctx.num_experts,
|
||||||
|
(num_tokens, ctx.top_k_num),
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.int32,
|
||||||
|
)
|
||||||
|
topk_weights = torch.randint(
|
||||||
|
0,
|
||||||
|
ctx.num_experts,
|
||||||
|
(num_tokens, ctx.top_k_num),
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.int32,
|
||||||
|
)
|
||||||
|
|
||||||
|
(sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = (
|
||||||
|
moe_lora_align_block_size(
|
||||||
|
topk_ids=curr_topk_ids,
|
||||||
|
token_lora_mapping=token_lora_mapping,
|
||||||
|
block_size=block_size,
|
||||||
|
num_experts=ctx.num_experts,
|
||||||
|
max_loras=ctx.num_loras,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1)
|
||||||
|
expert_ids = expert_ids_lora.view(ctx.num_loras, -1)
|
||||||
|
num_tokens_post_padded = num_tokens_post_padded_lora
|
||||||
|
return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded)
|
||||||
|
|
||||||
|
def as_lora_shrink_kwargs(
|
||||||
|
self, ctx: BenchmarkContext, op_type: OpType
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
self.sanity_check(ctx, op_type)
|
||||||
self.to_device(self.input.device)
|
self.to_device(self.input.device)
|
||||||
|
|
||||||
_, num_tokens, _, num_slices = self.metadata()
|
_, num_tokens, _, num_slices = self.metadata(ctx, op_type)
|
||||||
|
|
||||||
# Sanity check matrix shapes.
|
# Sanity check matrix shapes.
|
||||||
i_shape, lw_shape, o_shape = (
|
i_shape, lw_shape, o_shape = (
|
||||||
@@ -520,11 +742,13 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }

-    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        self.sanity_check()
+    def as_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
         self.to_device(self.input.device)

-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

         # Sanity check matrix shapes.
         i_shape, lw_shape, o_shape = (
@@ -561,18 +785,173 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }

-    def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: bool | None = None
+    def as_fused_moe_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
     ) -> dict[str, Any]:
-        if op_type.is_shrink_fn():
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+        # Expected input shape : [num_tokens, hidden_size] for gate_up
+        # Expected input shape : [top_k_num * num_tokens, hidden_size] for down
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape : [max_lora, num_experts, lora_rank, hidden_size]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == hidden_size
+        lora_rank = lw_shape[-2]
+        # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(o_shape) == 4
+        assert (
+            o_shape
+            == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank)
+            if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+            else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank)
+        )
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "qcurr_hidden_states": self.input,
+            "lora_a_stacked": self.lora_weights_lst,
+            "a_intermediate_cache1": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "shrink_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "shrink_num_warps": kernel_config["NUM_WARPS"],
+            "shrink_num_stages": kernel_config["NUM_STAGES"],
+            "shrink_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
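The branch in the shape assertion above reflects the two shrink variants: for the down projection the input already carries top_k_num copies of every token, so the token axis of the output is divided by top_k_num, while the gate_up variant keeps one row per token. With illustrative numbers (not taken from the benchmark defaults):

num_slices, num_tokens, top_k_num, lora_rank = 3, 8, 2, 16
# gate_up shrink: one output row per (token, routed expert) pair
gate_up_out = (num_slices, num_tokens, top_k_num, lora_rank)            # (3, 8, 2, 16)
# down shrink: the 8 input rows are top_k copies of only 4 real tokens
down_out = (num_slices, num_tokens // top_k_num, top_k_num, lora_rank)  # (3, 4, 2, 16)
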
+    def as_fused_moe_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+
+        # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(i_shape) == 4
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[-1]
+        # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == lora_rank
+        hidden_size = lw_shape[-2]
+        # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices]
+        assert len(o_shape) == 3
+        assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices)
+
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "a_intermediate_cache1": self.input,
+            "lora_b_stacked": self.lora_weights_lst,
+            "output": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "max_lora_rank": lora_rank,
+            "w1_output_dim_size": lw_shape[2],
+            "expand_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "expand_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "expand_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "expand_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "expand_num_warps": kernel_config["NUM_WARPS"],
+            "expand_num_stages": kernel_config["NUM_STAGES"],
+            "expand_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def bench_fn_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None
+    ) -> dict[str, Any]:
+        if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
             assert add_inputs is None
         else:
             assert add_inputs is not None

         if op_type == OpType.LORA_SHRINK:
-            return self.as_lora_shrink_kwargs()
+            return self.as_lora_shrink_kwargs(ctx, op_type)
         if op_type == OpType.LORA_EXPAND:
-            return self.as_lora_expand_kwargs(add_inputs)
+            return self.as_lora_expand_kwargs(ctx, op_type, add_inputs)
+        if op_type.is_fused_moe_lora_shrink_fn():
+            return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type)
+        if op_type.is_fused_moe_lora_expand_fn():
+            return self.as_fused_moe_lora_expand_kwargs(ctx, op_type)
         raise ValueError(f"Unrecognized optype {self}")

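The dispatch above leans on a set of OpType predicates that this hunk does not show. A plausible sketch of how such helpers could look; member names other than LORA_SHRINK, LORA_EXPAND, and FUSED_MOE_LORA_DOWN_SHRINK are assumptions, not the benchmark's actual enum:

import enum

class OpTypeSketch(enum.Enum):
    LORA_SHRINK = enum.auto()
    LORA_EXPAND = enum.auto()
    FUSED_MOE_LORA_GATE_UP_SHRINK = enum.auto()  # assumed name
    FUSED_MOE_LORA_GATE_UP_EXPAND = enum.auto()  # assumed name
    FUSED_MOE_LORA_DOWN_SHRINK = enum.auto()
    FUSED_MOE_LORA_DOWN_EXPAND = enum.auto()     # assumed name

    def is_fused_moe_lora_fn(self):
        return self.name.startswith("FUSED_MOE_LORA")

    def is_fused_moe_lora_shrink_fn(self):
        return self.is_fused_moe_lora_fn() and self.name.endswith("SHRINK")

    def is_fused_moe_lora_expand_fn(self):
        return self.is_fused_moe_lora_fn() and self.name.endswith("EXPAND")

    def is_fused_moe_lora_down_fn(self):
        return self.name.startswith("FUSED_MOE_LORA_DOWN")
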
     def test_correctness(
@@ -617,7 +996,7 @@ def bench_optype(
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1
-    if op_type.is_shrink_fn():
+    if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
         assert expand_fn_add_inputs is None
     else:
         assert expand_fn_add_inputs is not None
@@ -627,23 +1006,30 @@ def bench_optype(
         BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)
     ]
     for bt in bench_tensors:
-        bt.sanity_check()
+        bt.sanity_check(ctx, op_type)

     # Test correctness of our implementation.
     if test_correctness:
+        assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], (
+            f"Correctness testing is not supported for {op_type.name}."
+        )
         assert all(
-            [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors]
+            [
+                bt.test_correctness(ctx, op_type, expand_fn_add_inputs)
+                for bt in bench_tensors
+            ]
         )

     # BenchmarkTensors -> dict (kwargs)
     kwargs_list = [
-        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs)
         for bt in bench_tensors
     ]

     # Clear LoRA optimization hash-maps.
     _LORA_A_PTR_DICT.clear()
     _LORA_B_PTR_DICT.clear()
+    _LORA_PTR_DICT.clear()
     # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
@@ -793,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):

         # Benchmark bench_op
         expand_fn_add_inputs = (
-            [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+            [None]
+            if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn()
+            else args.expand_fn_add_inputs
         )
         for add_input_arg in expand_fn_add_inputs:
             seq_len_timers.append(
@@ -831,12 +1219,22 @@ def as_benchmark_contexts(
     hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace
 ) -> list[BenchmarkContext]:
     ctxs: list[BenchmarkContext] = []
-    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
+    for (
+        batch_size,
+        hidden_size,
+        lora_rank,
+        num_loras,
+        sort_by_lora_id,
+        top_k_num,
+        num_experts,
+    ) in product(  # noqa
         args.batch_sizes,
         list(hidden_sizes),
         lora_ranks,
         args.num_loras,
         args.sort_by_lora_id,
+        args.top_k_nums,
+        args.num_experts,
     ):
         ctxs.append(
             BenchmarkContext(
@@ -851,6 +1249,8 @@ def as_benchmark_contexts(
                 seq_length=None,
                 sort_by_lora_id=sort_by_lora_id,
                 dtype=args.dtype,
+                top_k_num=top_k_num,
+                num_experts=num_experts,
                 # To be filled based on the OpType to benchmark
                 num_slices=None,
             )
@@ -1012,6 +1412,22 @@ if __name__ == "__main__":
         ),
     )

+    p.add_argument(
+        "--top-k-nums",
+        nargs="+",
+        type=int,
+        default=DEFAULT_TOP_K_NUMS,
+        help="Top-K values for MoE LoRA operations",
+    )
+
+    p.add_argument(
+        "--num-experts",
+        nargs="+",
+        type=int,
+        default=DEFAULT_NUM_EXPERTS,
+        help="Number of experts for MoE LoRA operations",
+    )

     parser = FlexibleArgumentParser(
         description=f"""
 Benchmark LoRA kernels:
@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
         b_q_weight=w_q,
         b_bias=None,
         b_scales=w_s,
+        a_scales=None,
         global_scale=None,
         b_zeros=w_zp,
         g_idx=g_idx,
@@ -263,7 +263,7 @@ def bench_run(

     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -273,7 +273,7 @@ def bench_run(

     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -185,8 +185,8 @@ def benchmark_config(
         graph.replay()
     torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

     latencies: list[float] = []
     for i in range(num_iters):
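The switch from torch.cuda.Event to torch.Event keeps the same measurement idiom; torch.Event is the accelerator-agnostic constructor that newer PyTorch releases expose. The pattern in miniature, as a hedged sketch:

import torch

def time_kernel_ms(fn, iters: int = 10) -> float:
    # Record events around the launches, synchronize once, then read the
    # device-side elapsed time (milliseconds) between the two events.
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters
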
@@ -211,7 +211,7 @@ def get_rocm_tuning_space(use_fp16):
     num_warps_range = [1, 2, 4, 8]
     group_m_range = [1, 4, 8, 16, 32]
     num_stage_range = [2]
-    waves_per_eu_range = [0]
+    waves_per_eu_range = [0, 1, 2, 4]
     matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
     kpack_range = [1, 2] if use_fp16 else []

@@ -590,6 +590,7 @@ def main(args: argparse.Namespace):
         "DeepseekV3ForCausalLM",
         "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
+        "NemotronHForCausalLM",
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
@@ -615,6 +616,11 @@ def main(args: argparse.Namespace):
         topk = config.moe_topk[0]
         intermediate_size = config.moe_intermediate_size[0]
         hidden_size = config.hidden_size
+    elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
+        E = config.thinker_config.text_config.num_experts
+        topk = config.thinker_config.text_config.num_experts_per_tok
+        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
+        hidden_size = config.thinker_config.text_config.hidden_size
     else:
         # Support for llama4
         config = config.get_text_config()
@@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
 num_tokens_range = [1, 16, 256, 4096]
 num_experts_range = [16, 64, 224, 256, 280, 512]
 topk_range = [1, 2, 8]
-configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+ep_size_range = [1, 8]
+configs = list(
+    itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range)
+)


 @triton.testing.perf_report(
     triton.testing.Benchmark(
-        x_names=["num_tokens", "num_experts", "topk"],
+        x_names=["num_tokens", "num_experts", "topk", "ep_size"],
         x_vals=configs,
         line_arg="provider",
         line_vals=["vllm"],
@@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
         args={},
     )
 )
-def benchmark(num_tokens, num_experts, topk, provider):
+def benchmark(num_tokens, num_experts, topk, ep_size, provider):
     """Benchmark function for Triton."""
     block_size = 256
+    torch.cuda.manual_seed_all(0)
     topk_ids = get_topk_ids(num_tokens, num_experts, topk)

+    e_map = None
+    if ep_size != 1:
+        local_e = num_experts // ep_size
+        e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e]
+        e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32)
+        e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
+
     quantiles = [0.5, 0.2, 0.8]

     if provider == "vllm":
         ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: moe_align_block_size(topk_ids, block_size, num_experts),
+            lambda: moe_align_block_size(
+                topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True
+            ),
             quantiles=quantiles,
         )

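A toy configuration shows what the expert map does: with expert parallelism, each rank owns num_experts // ep_size experts, and e_map sends owned global ids to contiguous local slots while everything else maps to -1 so those blocks can be ignored. Illustrative values only:

import torch

num_experts, ep_size = 8, 2
local_e = num_experts // ep_size              # 4 experts per rank
e_ids = torch.tensor([1, 3, 4, 6])            # global ids owned by this rank
e_map = torch.full((num_experts,), -1, dtype=torch.int32)
e_map[e_ids] = torch.arange(local_e, dtype=torch.int32)
# e_map is now [-1, 0, -1, 1, 2, -1, 3, -1]: global expert 3 -> local slot 1
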
@@ -105,8 +105,8 @@ def benchmark_permute(
         graph.replay()
     torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

     latencies: list[float] = []
     for i in range(num_iters):
@@ -241,8 +241,8 @@ def benchmark_unpermute(
         graph.replay()
     torch.cuda.synchronize()

-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)

     latencies: list[float] = []
     for i in range(num_iters):
@@ -6,7 +6,7 @@
 #
 # The CSV file (named with current date/time) contains these columns:
 # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
+# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
 # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
 # speedup
 #
@@ -86,9 +86,8 @@ def benchmark_mrope(
     num_heads: int,
     num_kv_heads: int,
     max_position: int = 8192,
-    rope_theta: float = 10000,
     is_neox_style: bool = True,
-    rope_scaling: dict[str, Any] = None,
+    rope_parameters: dict[str, Any] | None = None,
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 0,
     warmup_iter: int = 10,
@@ -102,9 +101,8 @@ def benchmark_mrope(
         head_size=head_dim,
         rotary_dim=head_dim,
         max_position=max_position,
-        base=rope_theta,
         is_neox_style=is_neox_style,
-        rope_scaling=rope_scaling,
+        rope_parameters=rope_parameters,
         dtype=dtype,
     ).to(device=device)

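After this change the RoPE base frequency travels inside the rope_parameters dict rather than as a separate base/rope_theta argument. A hedged sketch of the new call shape; the key names inside the dict follow the model config and the values here are purely illustrative:

rope = get_rope(
    head_size=head_dim,
    rotary_dim=head_dim,
    max_position=8192,
    is_neox_style=True,
    # illustrative contents; scaled models carry extra fields here
    rope_parameters={"rope_theta": 10000.0},
    dtype=torch.bfloat16,
)
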
@@ -203,9 +201,8 @@ def benchmark_mrope(
             num_kv_heads,
             head_dim,
             max_position,
-            rope_theta,
             is_neox_style,
-            str(rope_scaling),
+            str(rope_parameters),
             str(dtype).split(".")[-1],
             torch_stats["mean"],
             torch_stats["median"],
@@ -255,9 +252,8 @@ if __name__ == "__main__":
         "num_kv_heads",
         "head_dim",
         "max_position",
-        "rope_theta",
         "is_neox_style",
-        "rope_scaling",
+        "rope_parameters",
         "dtype",
         "torch_mean",
         "torch_median",
@@ -303,7 +299,7 @@ if __name__ == "__main__":
     q_size = num_heads * head_dim
     kv_size = num_kv_heads * head_dim
     is_neox_style = True
-    rope_theta = config.rope_theta
+    rope_parameters = config.rope_parameters
     max_position = config.max_position_embeddings

     for num_tokens in num_tokens_list:
@@ -315,9 +311,8 @@ if __name__ == "__main__":
             num_heads=num_heads,
             num_kv_heads=num_kv_heads,
             max_position=max_position,
-            rope_theta=rope_theta,
             is_neox_style=is_neox_style,
-            rope_scaling=config.rope_scaling,
+            rope_parameters=rope_parameters,
             dtype=getattr(torch, args.dtype),
             seed=args.seed,
             warmup_iter=args.warmup_iter,
@@ -30,8 +30,8 @@ def _time_cuda(
         fn()
     torch.cuda.synchronize()

-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
+    start = torch.Event(enable_timing=True)
+    end = torch.Event(enable_timing=True)

     start.record()
     for _ in range(bench_iters):
@@ -1,97 +1,76 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from itertools import accumulate
+import itertools

-import nvtx
 import torch

-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser

+batch_size_range = [2**i for i in range(0, 8, 2)]
+seq_len_range = [2**i for i in range(6, 10, 1)]
+num_heads_range = [32, 48]
+configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range))

-def benchmark_rope_kernels_multi_lora(
-    is_neox_style: bool,
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: int | None,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    # silulating serving 4 LoRAs
-    scaling_factors = [1, 2, 4, 8]
-    # batched RoPE can take multiple scaling factors
-    batched_rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-        {"rope_type": "linear", "factor": tuple(scaling_factors)},
-    )
-    # non-batched RoPE takes only one scaling factor, we create multiple
-    # instances to simulate the same behavior
-    non_batched_ropes: list[RotaryEmbedding] = []
-    for scaling_factor in scaling_factors:
-        non_batched_ropes.append(
-            get_rope(
-                head_size,
-                rotary_dim,
-                max_position,
-                base,
-                is_neox_style,
-                {"rope_type": "linear", "factor": (scaling_factor,)},
-            )
-        )

-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
+def get_benchmark(head_size, rotary_dim, is_neox_style, device):
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len", "num_heads"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["torch", "flashinfer", "vllm"],
+            line_names=["PyTorch", "FlashInfer", "vLLM"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}",
+            args={},
+        )
+    )
+    def benchmark(batch_size, seq_len, num_heads, provider):
+        dtype = torch.bfloat16
+        max_position = 8192
+        base = 10000
+        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+        rope = rope.to(dtype=dtype, device=device)
+        cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)
+
+        positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
+        query = torch.randn(
+            (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device
+        )
         key = torch.randn_like(query)

-    # create query offsets for batched RoPE, we concat multiple kv cache
-    # together and each query needs to find the right kv cache of its type
-    offset_map = torch.tensor(
-        list(
-            accumulate(
-                [0]
-                + [
-                    max_position * scaling_factor * 2
-                    for scaling_factor in scaling_factors[:-1]
-                ]
-            )
-        )
-    )
-    query_types = torch.randint(
-        0, len(scaling_factors), (batch_size, seq_len), device=device
-    )
-    # map query types to offsets
-    query_offsets = offset_map[query_types]
-    # the kernel takes flattened offsets
-    flatten_offsets = query_offsets.flatten()
-
-    # batched queries of the same type together for non-batched RoPE
-    queries = [query[query_types == i] for i in range(len(scaling_factors))]
-    keys = [key[query_types == i] for i in range(len(scaling_factors))]
-    packed_qkr = zip(queries, keys, non_batched_ropes)
-    # synchronize before start timing
-    torch.cuda.synchronize()
-    with nvtx.annotate("non-batched", color="yellow"):
-        for q, k, r in packed_qkr:
-            r.forward(positions, q, k)
-    torch.cuda.synchronize()
-    with nvtx.annotate("batched", color="green"):
-        batched_rope.forward(positions, query, key, flatten_offsets)
-    torch.cuda.synchronize()
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "torch":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_native(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
+            )
+        elif provider == "flashinfer":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: torch.ops.vllm.flashinfer_rotary_embedding(
+                    positions,
+                    query.clone(),
+                    key.clone(),
+                    head_size,
+                    cos_sin_cache,
+                    is_neox_style,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rope.forward_cuda(positions, query.clone(), key.clone()),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark


 if __name__ == "__main__":
@@ -116,17 +95,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
     )
+    parser.add_argument("--save-path", type=str, default="./configs/rope/")
     args = parser.parse_args()
-    print(args)

-    benchmark_rope_kernels_multi_lora(
-        is_neox_style=args.is_neox_style,
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        num_heads=args.num_heads,
-        head_size=args.head_size,
-        rotary_dim=args.rotary_dim,
-        dtype=getattr(torch, args.dtype),
-        seed=args.seed,
-        device=args.device,
+    # Get the benchmark function
+    benchmark = get_benchmark(
+        args.head_size, args.rotary_dim, args.is_neox_style, args.device
     )
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
@@ -78,11 +78,11 @@ WEIGHT_SHAPES = {
 }

 WEIGHT_SHAPES_MOE = {
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
         [8, 2, 4096, 28672],
         [8, 2, 14336, 4096],
     ],
-    "nm-testing/deepseekv2-lite": [
+    "deepseek-ai/DeepSeek-V2-Lite": [
         [64, 6, 2048, 1408],
     ],
     "ibm-granite/granite-3.0-1b-a400m": [
Some files were not shown because too many files have changed in this diff.