Compare commits
1162 Commits
v0.15.1rc0
...
v0.17.0rc0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
097eb544e9 | ||
|
|
7cdba98edf | ||
|
|
3c85cd9d74 | ||
|
|
edba15045a | ||
|
|
e379396167 | ||
|
|
6e9f21e8a2 | ||
|
|
c1d963403c | ||
|
|
77e6dcbbfa | ||
|
|
70c73df69e | ||
|
|
9a9d442464 | ||
|
|
f7da9cdffc | ||
|
|
f22ff2958c | ||
|
|
d15c3b90fc | ||
|
|
97286a20ed | ||
|
|
12b38c0f45 | ||
|
|
467886a0c4 | ||
|
|
a9b8b13e5c | ||
|
|
e7213003cb | ||
|
|
3a8eef5869 | ||
|
|
97995f6376 | ||
|
|
881a6b011b | ||
|
|
8e1fd5baf0 | ||
|
|
ae88468bcc | ||
|
|
e05cb3b93e | ||
|
|
28ef9ba399 | ||
|
|
fb7fdc49c4 | ||
|
|
ea463978bb | ||
|
|
440f0e7dc6 | ||
|
|
fd4a90f337 | ||
|
|
ad9d09e2b8 | ||
|
|
4beebfd146 | ||
|
|
b8401cde0e | ||
|
|
5dfc5abe94 | ||
|
|
8fa68a8ce4 | ||
|
|
35a6f0bfe2 | ||
|
|
3a6cbf16e2 | ||
|
|
f44d1ddc8c | ||
|
|
48a54c1e0d | ||
|
|
8b9e8b7454 | ||
|
|
c21d0039ec | ||
|
|
7d8bbe6f42 | ||
|
|
25e02647c2 | ||
|
|
a0a5178ab4 | ||
|
|
8ea8ba275e | ||
|
|
4f85bae9d6 | ||
|
|
0a7165fd71 | ||
|
|
6521ccf286 | ||
|
|
8ebd872f50 | ||
|
|
168ee03e1c | ||
|
|
9dd656f0ea | ||
|
|
c8b678e53e | ||
|
|
18c29c746b | ||
|
|
96fc09503a | ||
|
|
1b82b433fc | ||
|
|
9319044ee9 | ||
|
|
c42dc402c1 | ||
|
|
fa6a6be519 | ||
|
|
cad21918e3 | ||
|
|
53700bf49b | ||
|
|
a13d8c03c9 | ||
|
|
9433acb8df | ||
|
|
d1a6e96d9e | ||
|
|
2a9e3347e9 | ||
|
|
cc0d565f40 | ||
|
|
358e4d5ba7 | ||
|
|
792a74b973 | ||
|
|
4034c3d32e | ||
|
|
7560d674c9 | ||
|
|
d9c7730877 | ||
|
|
ada4f4fadd | ||
|
|
7e9149d9a9 | ||
|
|
87c98b0236 | ||
|
|
de7dd634b9 | ||
|
|
9a87b0578f | ||
|
|
510bc9e1df | ||
|
|
cbd361fd46 | ||
|
|
c212202d93 | ||
|
|
ec27b36b4b | ||
|
|
3fd1d4ec2c | ||
|
|
cb21972a97 | ||
|
|
c34963f138 | ||
|
|
f26650d649 | ||
|
|
92f5d0f070 | ||
|
|
a60985b07e | ||
|
|
8b5014d3dd | ||
|
|
57a96e26c9 | ||
|
|
e82fbeec7b | ||
|
|
6290470843 | ||
|
|
72f4d16262 | ||
|
|
5a435507d8 | ||
|
|
59d7af9c6c | ||
|
|
bbf81f9a92 | ||
|
|
da543d1abe | ||
|
|
87d319c52f | ||
|
|
a9ec392c86 | ||
|
|
afd089f231 | ||
|
|
3ecd0bf9fc | ||
|
|
e3eb146f7a | ||
|
|
95a395dbec | ||
|
|
e94b263bd6 | ||
|
|
e113a30113 | ||
|
|
1dafb29f91 | ||
|
|
49b9ae32e9 | ||
|
|
63d7972f13 | ||
|
|
c68e69f144 | ||
|
|
7e08c22b8c | ||
|
|
8e75d88554 | ||
|
|
0892d1ab1f | ||
|
|
7600642eae | ||
|
|
1e69c04887 | ||
|
|
4292e3b807 | ||
|
|
24d6ea8afd | ||
|
|
57c86c0741 | ||
|
|
06254d4cbb | ||
|
|
f5d1281c9d | ||
|
|
94029ffaf0 | ||
|
|
88e8525f2e | ||
|
|
b2d8b422b2 | ||
|
|
1d5ab5d603 | ||
|
|
7b346ba8ed | ||
|
|
dea268336f | ||
|
|
90805ff464 | ||
|
|
2562e0271e | ||
|
|
fd68cd132b | ||
|
|
0edf101d2b | ||
|
|
d5b6f3ba36 | ||
|
|
1a014a0a93 | ||
|
|
86ac7bcf84 | ||
|
|
405f28d38d | ||
|
|
5323672bc2 | ||
|
|
a201ad72d8 | ||
|
|
e3691988d0 | ||
|
|
9fa6c68fa6 | ||
|
|
2ce6f3cf67 | ||
|
|
1f3dbd95fd | ||
|
|
1d532f9d8f | ||
|
|
234a65b781 | ||
|
|
2decec9856 | ||
|
|
29b35477b0 | ||
|
|
b1d9f5372d | ||
|
|
fd6de37fca | ||
|
|
c8aca0c9e1 | ||
|
|
b602e4f299 | ||
|
|
157722da75 | ||
|
|
1d897ff04f | ||
|
|
905d76b51d | ||
|
|
9098ce690c | ||
|
|
876312f0b5 | ||
|
|
5de98abc12 | ||
|
|
9251ed5c4f | ||
|
|
e8249378e4 | ||
|
|
6d4f9d3ad5 | ||
|
|
fbe3f0120a | ||
|
|
66c1751d13 | ||
|
|
6467b635b6 | ||
|
|
9c3fe9936b | ||
|
|
b66a74649e | ||
|
|
07bdabef03 | ||
|
|
a572baff5e | ||
|
|
516cf26698 | ||
|
|
487e5c51f7 | ||
|
|
1a8c71674e | ||
|
|
062b789632 | ||
|
|
a532c83849 | ||
|
|
1e5ad9b74f | ||
|
|
cabdaa7619 | ||
|
|
06be53563b | ||
|
|
c29ee9c326 | ||
|
|
d43048ce05 | ||
|
|
4fec53cfcb | ||
|
|
38c498b8e3 | ||
|
|
56a6371706 | ||
|
|
6283021142 | ||
|
|
01923eec70 | ||
|
|
31fb6f43da | ||
|
|
eb19955c37 | ||
|
|
0f2f24c8b2 | ||
|
|
d0105b84f0 | ||
|
|
832a780f3a | ||
|
|
98217b09f9 | ||
|
|
967572dd5f | ||
|
|
3d66502e1b | ||
|
|
c66aa48e99 | ||
|
|
b6d5a17298 | ||
|
|
5e58bdc711 | ||
|
|
a1f53addb1 | ||
|
|
05970c772c | ||
|
|
d940607629 | ||
|
|
99c7892c5b | ||
|
|
ec8f943db1 | ||
|
|
f2ad952f40 | ||
|
|
9e2cabdf9c | ||
|
|
ec8ab9d254 | ||
|
|
05972ea7e5 | ||
|
|
111d869069 | ||
|
|
7fea7250a4 | ||
|
|
845ee348ef | ||
|
|
ec13e549d3 | ||
|
|
c6ca51598a | ||
|
|
c0615a296d | ||
|
|
01914445b0 | ||
|
|
5281713e11 | ||
|
|
32693db8ce | ||
|
|
e03ddcfbd4 | ||
|
|
02acd16861 | ||
|
|
ab87f85231 | ||
|
|
3827c8c55a | ||
|
|
ade81f17fe | ||
|
|
6042e66cd5 | ||
|
|
9f9a675b23 | ||
|
|
a07c4c5939 | ||
|
|
d3a51da92a | ||
|
|
186ea22efe | ||
|
|
4a9c07a0a2 | ||
|
|
9d37941017 | ||
|
|
4171ff6dd9 | ||
|
|
13025e71e8 | ||
|
|
71dfce6aa6 | ||
|
|
2aa4140402 | ||
|
|
86c3b5a808 | ||
|
|
160424a937 | ||
|
|
9511a3f8ee | ||
|
|
de527e1cec | ||
|
|
1976356ee6 | ||
|
|
cbf8f7028c | ||
|
|
6831650c40 | ||
|
|
ed42507f6d | ||
|
|
9571e99945 | ||
|
|
c97234c08b | ||
|
|
b188bab441 | ||
|
|
15d76f74e2 | ||
|
|
8fd6975479 | ||
|
|
5d18bf8b32 | ||
|
|
0788ff0a15 | ||
|
|
d72b0be33c | ||
|
|
42489e43c2 | ||
|
|
af5e6afa0a | ||
|
|
ee59a7c615 | ||
|
|
709eadbb0b | ||
|
|
90fc7f9109 | ||
|
|
675ec59aa9 | ||
|
|
80e60a6133 | ||
|
|
26e722f906 | ||
|
|
2c619e5e3f | ||
|
|
8a685be8d9 | ||
|
|
2465071510 | ||
|
|
cd43673668 | ||
|
|
35d44b4557 | ||
|
|
8ad54a991b | ||
|
|
92510edc32 | ||
|
|
a6c137521c | ||
|
|
4572a06afe | ||
|
|
5cc29cfb8b | ||
|
|
8fae54faff | ||
|
|
f7967577f5 | ||
|
|
af770b8e7b | ||
|
|
2ff3e436ad | ||
|
|
c2c4c4611a | ||
|
|
f38f8c9742 | ||
|
|
ec1d30c0f6 | ||
|
|
e3b2324ec4 | ||
|
|
dbf0da817a | ||
|
|
3bbb2046ff | ||
|
|
576fe50333 | ||
|
|
a0e50a4260 | ||
|
|
9fa5b25a23 | ||
|
|
ea97750414 | ||
|
|
067c5d9ad1 | ||
|
|
f5972a872f | ||
|
|
a9e15e040d | ||
|
|
542ca66357 | ||
|
|
fc8456c336 | ||
|
|
9ce8fad2a9 | ||
|
|
c38b8d5a31 | ||
|
|
60da0e1544 | ||
|
|
9609b1f18d | ||
|
|
a0c7081695 | ||
|
|
34ce0ffd1f | ||
|
|
0de5333989 | ||
|
|
a87cc50859 | ||
|
|
761e63e541 | ||
|
|
d12d201409 | ||
|
|
b3ad37c5db | ||
|
|
14561fabfd | ||
|
|
c77f3e1207 | ||
|
|
012dee9233 | ||
|
|
f1c664545b | ||
|
|
c870eb9e0f | ||
|
|
6af03f2394 | ||
|
|
1a6cf39dec | ||
|
|
f91808ae0d | ||
|
|
33a0d43c71 | ||
|
|
80d93fd6da | ||
|
|
ec85340531 | ||
|
|
2ff4e51152 | ||
|
|
95642441d0 | ||
|
|
a7c9f7b7ec | ||
|
|
a4bd661fb3 | ||
|
|
3ef9fd0f98 | ||
|
|
22a97e6613 | ||
|
|
596ed1f02e | ||
|
|
b8d8b7e934 | ||
|
|
28c5e69ba0 | ||
|
|
864167d376 | ||
|
|
a2ba6a5244 | ||
|
|
c4f38696f7 | ||
|
|
a7f341c323 | ||
|
|
d13ece38d7 | ||
|
|
5cc7c4452e | ||
|
|
b95bb6927f | ||
|
|
392645454b | ||
|
|
1e8438a89a | ||
|
|
8435b2e049 | ||
|
|
b1b5e045df | ||
|
|
5f68464f92 | ||
|
|
aa08a30fc9 | ||
|
|
7f40e9e516 | ||
|
|
103e614b14 | ||
|
|
54e2f83d0a | ||
|
|
e631f8e78e | ||
|
|
e97c46a92d | ||
|
|
7291d1b288 | ||
|
|
987506bca6 | ||
|
|
c645e9a214 | ||
|
|
944ffb5968 | ||
|
|
2bcf71b9c0 | ||
|
|
b7892a3bef | ||
|
|
682566b18e | ||
|
|
b9c2a565cc | ||
|
|
dd8c3a7fb2 | ||
|
|
a8a47c17b6 | ||
|
|
40f88d8318 | ||
|
|
2cbf9656ce | ||
|
|
30132cd144 | ||
|
|
cbd95a2dd1 | ||
|
|
970861ac0c | ||
|
|
d24bdd7c4b | ||
|
|
d403c1da1c | ||
|
|
b71fbd06e2 | ||
|
|
74d90b1ce4 | ||
|
|
a4047d4ea9 | ||
|
|
965fe45935 | ||
|
|
98b0205c3c | ||
|
|
272b535ab3 | ||
|
|
f74f1572ca | ||
|
|
bebfe55b1c | ||
|
|
820d7815eb | ||
|
|
ab6f3487a6 | ||
|
|
8dc8a99b56 | ||
|
|
2aab2bb543 | ||
|
|
54254f7a61 | ||
|
|
cf93c1a128 | ||
|
|
89358f0d35 | ||
|
|
a0fe7ea2f0 | ||
|
|
991d6bff38 | ||
|
|
5719a4e4e6 | ||
|
|
11be2c74dc | ||
|
|
7a5adad480 | ||
|
|
59c6233297 | ||
|
|
d38cd3dde5 | ||
|
|
ded333fb9b | ||
|
|
9d7577b2bd | ||
|
|
e739c29ea4 | ||
|
|
a55caf6ae9 | ||
|
|
0e22cd618b | ||
|
|
ea5f903f80 | ||
|
|
0632ed8778 | ||
|
|
aaefc58ee0 | ||
|
|
f24b2de3d3 | ||
|
|
fac1507f03 | ||
|
|
f863994084 | ||
|
|
e4a5d8c653 | ||
|
|
a6d0299c75 | ||
|
|
6ce80f7071 | ||
|
|
1fe462168c | ||
|
|
ed31a020ee | ||
|
|
f9ac19204f | ||
|
|
59965affbd | ||
|
|
b1c4f0b265 | ||
|
|
8de7c636cc | ||
|
|
059779231f | ||
|
|
ea37530b47 | ||
|
|
f5432e35a3 | ||
|
|
07cab212f0 | ||
|
|
0c1dc42748 | ||
|
|
676f82ae81 | ||
|
|
81bfc21a6a | ||
|
|
4e2c7caf2d | ||
|
|
d9e62c03eb | ||
|
|
a1a2d79442 | ||
|
|
ac900c89bb | ||
|
|
76df6072ff | ||
|
|
16f24e8797 | ||
|
|
40b2f1c3d9 | ||
|
|
648951a9c3 | ||
|
|
f72061a19a | ||
|
|
662205d34e | ||
|
|
4fb8beefaa | ||
|
|
304319c4ed | ||
|
|
c683d11c94 | ||
|
|
3eff45d793 | ||
|
|
4685a630a2 | ||
|
|
ee1d25f199 | ||
|
|
6fff24f30f | ||
|
|
23210a911e | ||
|
|
1391378861 | ||
|
|
f6220f9877 | ||
|
|
2df2bb27b0 | ||
|
|
f75b61a9e9 | ||
|
|
7f51e93864 | ||
|
|
4611af1663 | ||
|
|
ad5aa6bd9f | ||
|
|
9681068cf9 | ||
|
|
b6101d384d | ||
|
|
5fcb0cdd68 | ||
|
|
c878b43b64 | ||
|
|
2b84ac669c | ||
|
|
11d3976b88 | ||
|
|
40da9625a1 | ||
|
|
8d9babd4de | ||
|
|
e99ba957ec | ||
|
|
64ac1395e8 | ||
|
|
61cf087680 | ||
|
|
847a57cd12 | ||
|
|
fcd6ac97ed | ||
|
|
95be2a7f22 | ||
|
|
0e60c925cf | ||
|
|
d7ff22204a | ||
|
|
c0bd8b13da | ||
|
|
caeb887bf6 | ||
|
|
6b3166a7c7 | ||
|
|
25e2e136ef | ||
|
|
6874638bc4 | ||
|
|
e24663c5a9 | ||
|
|
c50e105a88 | ||
|
|
a766b30349 | ||
|
|
1faa8cb73c | ||
|
|
e89a91d927 | ||
|
|
909b147197 | ||
|
|
a88b3be7c4 | ||
|
|
a49ea5a58f | ||
|
|
30ebe0dc3c | ||
|
|
cef65f0715 | ||
|
|
6f3b2047ab | ||
|
|
02e8f26cea | ||
|
|
4a00a511bb | ||
|
|
a0d8d944e2 | ||
|
|
df3f537a66 | ||
|
|
7743152957 | ||
|
|
ab33d2a629 | ||
|
|
be3af2d29e | ||
|
|
c656ba3b4d | ||
|
|
dc5fa77a4e | ||
|
|
1e4a084c8e | ||
|
|
7967e854da | ||
|
|
6bd6d0c3c1 | ||
|
|
8e962fef5f | ||
|
|
574fe75245 | ||
|
|
c61a98f529 | ||
|
|
28bffe9466 | ||
|
|
ad65177a19 | ||
|
|
d44a5b6c47 | ||
|
|
1d65283e95 | ||
|
|
c464b57374 | ||
|
|
c5c38e152a | ||
|
|
d00df624f3 | ||
|
|
9752da9d9c | ||
|
|
04925b2202 | ||
|
|
d74278fb67 | ||
|
|
b68fd899d1 | ||
|
|
0b5f9b7204 | ||
|
|
9a8853f781 | ||
|
|
387a1898d9 | ||
|
|
3b30e61507 | ||
|
|
824f9e8f3c | ||
|
|
6cc403e67d | ||
|
|
72d5951d02 | ||
|
|
a3205beffb | ||
|
|
6930becd45 | ||
|
|
03a8770a6d | ||
|
|
bc56a1d56e | ||
|
|
ec7d9e6745 | ||
|
|
3bb4e4311c | ||
|
|
08f8c198ae | ||
|
|
a21cedf4ff | ||
|
|
3ef74cde5d | ||
|
|
cd81cdb399 | ||
|
|
1e828573b4 | ||
|
|
a5ccc85c8c | ||
|
|
b5475d0534 | ||
|
|
9521002f0a | ||
|
|
ec17bdd894 | ||
|
|
bb59c90248 | ||
|
|
5bff999d12 | ||
|
|
bb85929aa6 | ||
|
|
5653021094 | ||
|
|
974d829b05 | ||
|
|
91ac5d9bfd | ||
|
|
23d825aba1 | ||
|
|
f07a128413 | ||
|
|
71cd89264f | ||
|
|
19fab44152 | ||
|
|
79c7e09235 | ||
|
|
79f3fab05a | ||
|
|
604b9eaec5 | ||
|
|
50dbd6c9e6 | ||
|
|
98bcc6ca59 | ||
|
|
f13e86d8dd | ||
|
|
9ca768c740 | ||
|
|
d5fe3f702c | ||
|
|
73391a1baa | ||
|
|
b3c14229b0 | ||
|
|
2f186635cb | ||
|
|
342a7cda2d | ||
|
|
d1ea65d0a1 | ||
|
|
de42abb366 | ||
|
|
60ca7981bc | ||
|
|
0ef5b9147b | ||
|
|
ed242652d7 | ||
|
|
b37b679770 | ||
|
|
a0638d052d | ||
|
|
c027541eaf | ||
|
|
fd267bc7b7 | ||
|
|
bfaa559305 | ||
|
|
87789c8364 | ||
|
|
bcd65c1f6a | ||
|
|
59d53066d8 | ||
|
|
4a9952ec1b | ||
|
|
1dae7b7843 | ||
|
|
5885e330ef | ||
|
|
071d863e20 | ||
|
|
0916e7960b | ||
|
|
3d2a026fd0 | ||
|
|
dddbff4624 | ||
|
|
47e9b63e1a | ||
|
|
934acddef9 | ||
|
|
742d214d6e | ||
|
|
4137c5dfa7 | ||
|
|
7a8a46ddcb | ||
|
|
bcf0731aa0 | ||
|
|
ec090c2429 | ||
|
|
eea3024f43 | ||
|
|
2f308214c0 | ||
|
|
1b4e8e53f8 | ||
|
|
dcf6ee8592 | ||
|
|
372b2e762a | ||
|
|
6afa587d31 | ||
|
|
94ed6cf6ea | ||
|
|
bf37812ca7 | ||
|
|
b86bf4417e | ||
|
|
de13dd781f | ||
|
|
62788f99a4 | ||
|
|
ea5ff3a1f6 | ||
|
|
04ea31baab | ||
|
|
6f019e6e0a | ||
|
|
d707678dfb | ||
|
|
fc22cae4ac | ||
|
|
96161fe978 | ||
|
|
4453ba8d9e | ||
|
|
aa181c923b | ||
|
|
be7370daf3 | ||
|
|
9ea1f598ce | ||
|
|
f120bd42d3 | ||
|
|
fac4e96940 | ||
|
|
6d4e27ce29 | ||
|
|
4c078fa546 | ||
|
|
6c0baee610 | ||
|
|
1100a97621 | ||
|
|
766e167821 | ||
|
|
becbe24808 | ||
|
|
679ca5d8d3 | ||
|
|
f2c47886fd | ||
|
|
334c715e0f | ||
|
|
7b5a8b4a9d | ||
|
|
dea63512bb | ||
|
|
8a798be929 | ||
|
|
fb455ed547 | ||
|
|
f5897613fb | ||
|
|
55a1a9563a | ||
|
|
386bfe5d08 | ||
|
|
e9cd691132 | ||
|
|
80f2ba6ea6 | ||
|
|
136b0bfa59 | ||
|
|
b96f7314b4 | ||
|
|
ced2a92f40 | ||
|
|
e1d97c38f8 | ||
|
|
ec12d39d44 | ||
|
|
ff1f83b056 | ||
|
|
83b47f67b1 | ||
|
|
fb7b30c716 | ||
|
|
31d992d215 | ||
|
|
5aff2699bd | ||
|
|
527ca32197 | ||
|
|
5458eb835d | ||
|
|
144d9b7cc8 | ||
|
|
83e26c834e | ||
|
|
5001211369 | ||
|
|
11c7ace340 | ||
|
|
be7f3d5d20 | ||
|
|
0ab06100f4 | ||
|
|
ffb3d553cc | ||
|
|
fa7e0bfacf | ||
|
|
48134a2c22 | ||
|
|
64f570ab56 | ||
|
|
fd618871b4 | ||
|
|
67a42b5a44 | ||
|
|
c7914d30f9 | ||
|
|
1b8756562e | ||
|
|
275e0d2a99 | ||
|
|
0f5e55e7a8 | ||
|
|
1e9204bff3 | ||
|
|
05339a7b20 | ||
|
|
40b8f55358 | ||
|
|
5045d5c983 | ||
|
|
e09546cf05 | ||
|
|
786806dd44 | ||
|
|
79504027ef | ||
|
|
addac0e653 | ||
|
|
675a22ed66 | ||
|
|
cb9574eb85 | ||
|
|
21dfb842d7 | ||
|
|
d1b837f0ae | ||
|
|
0b20469c62 | ||
|
|
d7982daff5 | ||
|
|
9b17c57460 | ||
|
|
1b3540e6c6 | ||
|
|
7a048ee65f | ||
|
|
c9a1923bb4 | ||
|
|
b482f71e9f | ||
|
|
1485396abb | ||
|
|
5ee5c86eeb | ||
|
|
b5dcb372e4 | ||
|
|
066c6da6a0 | ||
|
|
e30cedd44b | ||
|
|
3bcd494ef4 | ||
|
|
0e725a7d22 | ||
|
|
ba0511fd80 | ||
|
|
4a1550d22d | ||
|
|
d1481ba783 | ||
|
|
dc6de33c3d | ||
|
|
c4b9e6778f | ||
|
|
341eed3d30 | ||
|
|
6f2f59f2b3 | ||
|
|
bb2fc8b5e7 | ||
|
|
67132945bb | ||
|
|
f0ca0671c7 | ||
|
|
578977bb5e | ||
|
|
9615575afc | ||
|
|
4293c00b84 | ||
|
|
506ad7d7c1 | ||
|
|
fdd6f2ad58 | ||
|
|
33bcd3dc3b | ||
|
|
1f5febb4b8 | ||
|
|
ae871ca923 | ||
|
|
a2443de5fa | ||
|
|
f84a2a8f31 | ||
|
|
000214c4bb | ||
|
|
c5a66d1697 | ||
|
|
afdce12c89 | ||
|
|
82e11973cc | ||
|
|
b129136c7a | ||
|
|
599e4335a4 | ||
|
|
a1946570d8 | ||
|
|
d0bc520569 | ||
|
|
748625cdaf | ||
|
|
61413973e8 | ||
|
|
94de871546 | ||
|
|
e042d7e685 | ||
|
|
ae4e280602 | ||
|
|
cbea11c9f0 | ||
|
|
2c32558a3c | ||
|
|
5f970120f0 | ||
|
|
998e2d91f8 | ||
|
|
e1060a71a1 | ||
|
|
97fa8f6590 | ||
|
|
dab1de9f38 | ||
|
|
8d48d0a9d9 | ||
|
|
9608844f96 | ||
|
|
f69b903b4c | ||
|
|
81e217fe6b | ||
|
|
ab97bcf662 | ||
|
|
25e48a3aae | ||
|
|
8a5e0e2b2b | ||
|
|
4cde2e0159 | ||
|
|
047a457fa4 | ||
|
|
e94ec59733 | ||
|
|
13397841ab | ||
|
|
c60f8e3b49 | ||
|
|
5e75a14a66 | ||
|
|
e7e52781ff | ||
|
|
bb9f97308d | ||
|
|
4d39650961 | ||
|
|
8fd31f6245 | ||
|
|
eadb4e868b | ||
|
|
285bab4752 | ||
|
|
995bbf38f1 | ||
|
|
d4f123cc48 | ||
|
|
cb62e86f83 | ||
|
|
781ddf7868 | ||
|
|
64a9c2528b | ||
|
|
d0d97e2974 | ||
|
|
9562912cea | ||
|
|
9bdb06b436 | ||
|
|
caad9f1e01 | ||
|
|
1d5922fade | ||
|
|
3025b3cebb | ||
|
|
978a37c823 | ||
|
|
5a5c43511a | ||
|
|
d9bede0314 | ||
|
|
22b64948f6 | ||
|
|
7c233dbb36 | ||
|
|
a75a5b54c7 | ||
|
|
f97ca67176 | ||
|
|
084aa19f02 | ||
|
|
1ecfabe525 | ||
|
|
4df841fe75 | ||
|
|
a263aa6140 | ||
|
|
179ae7da8f | ||
|
|
c4df59ad43 | ||
|
|
785cf28fff | ||
|
|
a96197f564 | ||
|
|
ab10d79855 | ||
|
|
7fcb705b80 | ||
|
|
b956cdf818 | ||
|
|
ed17f54c8b | ||
|
|
860981d8d8 | ||
|
|
52181baaea | ||
|
|
de3869bb4d | ||
|
|
ce9b3cd3e9 | ||
|
|
db4ede9743 | ||
|
|
2cb2340f7a | ||
|
|
4df44c16ba | ||
|
|
81fe69cae5 | ||
|
|
dd6a6e1190 | ||
|
|
edb359cce4 | ||
|
|
6ed5eda300 | ||
|
|
11a4c9d30d | ||
|
|
15a0b9e570 | ||
|
|
c490d8cc73 | ||
|
|
48312e579a | ||
|
|
bc32444b23 | ||
|
|
18e8545297 | ||
|
|
6f7adc533a | ||
|
|
40218a82ba | ||
|
|
1c3b22058f | ||
|
|
3920cafdd6 | ||
|
|
ec28784fdc | ||
|
|
55aeec04f5 | ||
|
|
906077181b | ||
|
|
89a385d79f | ||
|
|
4a2d00eafd | ||
|
|
207c3a0c20 | ||
|
|
ae2e93f89b | ||
|
|
9e9acce577 | ||
|
|
fe5438200b | ||
|
|
77c09e1130 | ||
|
|
16786da735 | ||
|
|
aaa2efbe98 | ||
|
|
aca5967416 | ||
|
|
67a746e87f | ||
|
|
7bec435130 | ||
|
|
5c52644b10 | ||
|
|
2ce9fe4ad0 | ||
|
|
cd8b405bd0 | ||
|
|
4707f7ebb4 | ||
|
|
c39ee9ee2b | ||
|
|
350ca72c04 | ||
|
|
1fb0495a72 | ||
|
|
85ee1d962b | ||
|
|
51a7bda625 | ||
|
|
6e7b1c4b59 | ||
|
|
2991dd3d22 | ||
|
|
ac32e66cf9 | ||
|
|
f79d9dce16 | ||
|
|
ba5cbbf107 | ||
|
|
233b26ab35 | ||
|
|
791a94bed0 | ||
|
|
e969a169ef | ||
|
|
6d8d34be6d | ||
|
|
1363e3d6d5 | ||
|
|
965525667b | ||
|
|
6550815c3a | ||
|
|
7439e4f41b | ||
|
|
ac04dd374f | ||
|
|
035a6cb09a | ||
|
|
a32cb49b60 | ||
|
|
20d7454c9b | ||
|
|
5819ca8944 | ||
|
|
79028d4388 | ||
|
|
325ab6b0a8 | ||
|
|
91a07ff618 | ||
|
|
d5c4800112 | ||
|
|
42d5d705f9 | ||
|
|
116880a5a0 | ||
|
|
4145e50d85 | ||
|
|
20f5d185a6 | ||
|
|
1887acca9e | ||
|
|
92e7562a99 | ||
|
|
87d0d17ab5 | ||
|
|
a57c8228ff | ||
|
|
1ee95841bd | ||
|
|
7d8c6804e2 | ||
|
|
af3162d3aa | ||
|
|
5b2a9422f0 | ||
|
|
c1858b7ec8 | ||
|
|
82914d2ae8 | ||
|
|
81a90e5277 | ||
|
|
1c3a221d3b | ||
|
|
7bd42e609d | ||
|
|
a2522839d8 | ||
|
|
59a5cb387a | ||
|
|
8322d4e47f | ||
|
|
3e472e81f9 | ||
|
|
038914b7c8 | ||
|
|
d2f4a71cd5 | ||
|
|
2abd97592f | ||
|
|
6abb0454ad | ||
|
|
db6f71d4c9 | ||
|
|
fd03538bf9 | ||
|
|
1f70313e59 | ||
|
|
07daee132b | ||
|
|
9595afda18 | ||
|
|
c1395f72cd | ||
|
|
007b183d74 | ||
|
|
add9f1fbd9 | ||
|
|
e3bf79ffa0 | ||
|
|
fb1270f1f8 | ||
|
|
72bb24e2db | ||
|
|
a7be77beef | ||
|
|
bbe0574d8e | ||
|
|
4d9513537d | ||
|
|
439afa4eea | ||
|
|
fa4e0fb028 | ||
|
|
ce498a6d61 | ||
|
|
9f14c9224d | ||
|
|
535de06cb1 | ||
|
|
4292c90a2a | ||
|
|
6e98f6d8b6 | ||
|
|
2f6d17cb2f | ||
|
|
192ad4648b | ||
|
|
0e92298622 | ||
|
|
87d9a26166 | ||
|
|
80f921ba4b | ||
|
|
711edaf0d0 | ||
|
|
1d367a738e | ||
|
|
32a02c7ca2 | ||
|
|
f67ee8b859 | ||
|
|
e57ef99b40 | ||
|
|
f8516a1ab9 | ||
|
|
824058076c | ||
|
|
8e32690869 | ||
|
|
a208439537 | ||
|
|
bcd2f74c0d | ||
|
|
f79f777803 | ||
|
|
4c8d1bf361 | ||
|
|
061da6bcf7 | ||
|
|
4403e3ed4c | ||
|
|
08e094997e | ||
|
|
d88a1df699 | ||
|
|
90d74ebaa4 | ||
|
|
45f8fd6f97 | ||
|
|
5e1e0a0fbd | ||
|
|
eb5ed20743 | ||
|
|
2647163674 | ||
|
|
9fb27dd3b3 | ||
|
|
4dffc5e044 | ||
|
|
e1bf04b6c2 | ||
|
|
02080179a3 | ||
|
|
1b8fe6f7c4 | ||
|
|
52ee21021a | ||
|
|
655efb3e69 | ||
|
|
bd8da29a66 | ||
|
|
2a99c5a6c8 | ||
|
|
3f7662d650 | ||
|
|
a372f3f40a | ||
|
|
61e632aea1 | ||
|
|
b1bb18de8d | ||
|
|
2267cb1cfd | ||
|
|
0d6ccf68fa | ||
|
|
18e7cbbb15 | ||
|
|
f0d5251715 | ||
|
|
5c4f2dd6ef | ||
|
|
f3d8a34671 | ||
|
|
4bc913aeec | ||
|
|
fbb3cf6981 | ||
|
|
2df2b3499d | ||
|
|
2a8d84e66d | ||
|
|
a3acfa1071 | ||
|
|
be8168ff88 | ||
|
|
f6af34626d | ||
|
|
ceab70c89d | ||
|
|
52683ccbe1 | ||
|
|
e346e2d056 | ||
|
|
83449a5ff0 | ||
|
|
dad2d6a590 | ||
|
|
32e84fa1ff | ||
|
|
fd9c83d0e0 | ||
|
|
b95cc5014d | ||
|
|
61397891ce | ||
|
|
ef248ff740 | ||
|
|
e10604480b | ||
|
|
bf001da4bf | ||
|
|
a0a984ac2e | ||
|
|
f1cb9b5544 | ||
|
|
4c4b6f7a97 | ||
|
|
10546f925a | ||
|
|
e69c990c21 | ||
|
|
5eac9a1b34 | ||
|
|
1b60b45d0d | ||
|
|
4b3803d180 | ||
|
|
5019c59dd2 | ||
|
|
089cd4f002 | ||
|
|
0130223bd9 | ||
|
|
5d1aef3004 | ||
|
|
ffe1fc7a28 | ||
|
|
8b7346d5f1 | ||
|
|
6141ebe0dd | ||
|
|
199e3cb476 | ||
|
|
9f8cb81b44 | ||
|
|
d7e17aaacd | ||
|
|
528e9b1490 | ||
|
|
d95b4be47a | ||
|
|
4061dcf4c5 | ||
|
|
0aca8b8c62 | ||
|
|
9eb58f8cf1 | ||
|
|
b10d05b8a8 | ||
|
|
b398e5c819 | ||
|
|
78061ef584 | ||
|
|
528b3076af | ||
|
|
a502831d36 | ||
|
|
ba871fb788 | ||
|
|
ab374786c7 | ||
|
|
808dd87b30 | ||
|
|
beb8899482 | ||
|
|
ce88756b96 | ||
|
|
a3154a6092 | ||
|
|
7c036432fc | ||
|
|
318b120766 | ||
|
|
c3b40dc3e7 | ||
|
|
a01ef3fa51 | ||
|
|
7320ca3942 | ||
|
|
cf0a99f84d | ||
|
|
e535d90deb | ||
|
|
0b225fb7b2 | ||
|
|
46b4a02794 | ||
|
|
8869cd8ec1 | ||
|
|
cd86fff38f | ||
|
|
b5f8c3092d | ||
|
|
21997f45b1 | ||
|
|
672023877b | ||
|
|
754a8ca942 | ||
|
|
302ecf64ff | ||
|
|
b6bb2842cf | ||
|
|
79b6ec6aab | ||
|
|
d6416fdde9 | ||
|
|
0fb3157267 | ||
|
|
a358e4dffe | ||
|
|
079781177a | ||
|
|
63c0889416 | ||
|
|
1e86c802d4 | ||
|
|
fedf64332e | ||
|
|
2238a12c13 | ||
|
|
ce0afe2451 | ||
|
|
88c3e114d8 | ||
|
|
92924b2ddd | ||
|
|
27cb2f678f | ||
|
|
22d9a056d5 | ||
|
|
13b842f271 | ||
|
|
15f40b20aa | ||
|
|
793af538a3 | ||
|
|
6f5e7cda57 | ||
|
|
68feb76a6f | ||
|
|
4cb59dea6a | ||
|
|
608b556507 | ||
|
|
f0a1c8453a | ||
|
|
8980001c93 | ||
|
|
527bcd14d4 | ||
|
|
f68e3ea4e1 | ||
|
|
d5c41db35b | ||
|
|
1618e25492 | ||
|
|
f3888aca83 | ||
|
|
f0bca83ee4 | ||
|
|
73419abfae | ||
|
|
e77f162cf5 | ||
|
|
8ecd213c0b | ||
|
|
5b55c0bea7 | ||
|
|
15e0bb9c42 | ||
|
|
6c64c41b4a | ||
|
|
a2ef06e1b3 | ||
|
|
0a3c71e7e5 | ||
|
|
29fba76781 | ||
|
|
9df152bbf6 | ||
|
|
876a16f4fb | ||
|
|
aaa901ad55 | ||
|
|
010ec0c30e | ||
|
|
64a40a7ab4 | ||
|
|
31aedfe7d6 | ||
|
|
67ebaff528 | ||
|
|
2b465570e6 | ||
|
|
9ca66ecc10 | ||
|
|
c3a9752b0c | ||
|
|
f451b4558b | ||
|
|
3f96fcf646 | ||
|
|
6c1f9e4c18 | ||
|
|
67239c4c42 | ||
|
|
8ece60768f | ||
|
|
fd0e377244 | ||
|
|
f857a03f6b | ||
|
|
74898a7015 | ||
|
|
8f5d51203b | ||
|
|
ae5b7aff2b | ||
|
|
a11bc12d53 | ||
|
|
58cb55e4de | ||
|
|
cf896ae0e3 | ||
|
|
c5113f60f2 | ||
|
|
174f16700b | ||
|
|
8e2ad97ad0 | ||
|
|
10152d2194 | ||
|
|
1a7894dbdf | ||
|
|
c87eac18f7 | ||
|
|
f45870b53f | ||
|
|
ba45bedfd1 | ||
|
|
9432ed8c7e | ||
|
|
726d89720c | ||
|
|
d334dd26c4 | ||
|
|
070c811d6f | ||
|
|
8bfc8d5600 | ||
|
|
ec51831a22 | ||
|
|
80b918f2bd | ||
|
|
c46b0cd0af | ||
|
|
133765760b | ||
|
|
bfb9bdaf3f | ||
|
|
2284461d02 | ||
|
|
8e2a469b3b | ||
|
|
23591e631e | ||
|
|
0493d897c4 | ||
|
|
8c8ebeb941 | ||
|
|
831453fcef | ||
|
|
5a66c9cc76 | ||
|
|
5e73e4900c | ||
|
|
c6e7404cc5 | ||
|
|
17b17c0684 | ||
|
|
8bb6271c77 | ||
|
|
8b3f0a99dd | ||
|
|
8311f083bd | ||
|
|
40c35038d2 | ||
|
|
a5aa4d5c0f | ||
|
|
615e8033e5 | ||
|
|
d09135fbd0 | ||
|
|
8688c3d460 | ||
|
|
5400014d55 | ||
|
|
3a92c6f3b5 | ||
|
|
e01ff5c070 | ||
|
|
fb946a7f89 | ||
|
|
a650ad1588 | ||
|
|
d697581a7c | ||
|
|
5eeba80c74 | ||
|
|
08b1195e62 | ||
|
|
3bba2edb0f | ||
|
|
53fc166402 | ||
|
|
31b25f6516 | ||
|
|
abb34ac43a | ||
|
|
2515bbd027 | ||
|
|
c487a8eef4 | ||
|
|
9e138cb01d | ||
|
|
f9d03599ef | ||
|
|
39037d258e | ||
|
|
51550179fc | ||
|
|
07ea184f00 | ||
|
|
a663b218ae | ||
|
|
1bd47d6e5a | ||
|
|
141cd43967 | ||
|
|
6bf3b46d78 | ||
|
|
77c4f45c6c | ||
|
|
ca1969186d | ||
|
|
ab597c869a | ||
|
|
4197168ea5 | ||
|
|
59bcc5b6f2 | ||
|
|
3e440786af | ||
|
|
8bdd3979d8 | ||
|
|
c4e744dbd4 | ||
|
|
8ebf372e9d | ||
|
|
f210f0b7b1 | ||
|
|
392c5af4fe | ||
|
|
af9b69f977 | ||
|
|
8e5e40daf4 | ||
|
|
2e8de86777 | ||
|
|
247d1a32ea | ||
|
|
ecb4f82209 | ||
|
|
5914090765 | ||
|
|
f1acbd68c5 | ||
|
|
9581185d51 | ||
|
|
2dd359f953 | ||
|
|
22ad649501 | ||
|
|
36d450e3b8 | ||
|
|
a2b877df6c | ||
|
|
35fb0b8613 | ||
|
|
2eb673a088 | ||
|
|
a97b5e206d | ||
|
|
911b51b69f | ||
|
|
604e3b87e8 | ||
|
|
706f123b23 | ||
|
|
fb7abfc1d0 | ||
|
|
5d3d6e44e8 | ||
|
|
46ec6d71c7 | ||
|
|
e82fa448c4 | ||
|
|
d9aa39a3bb | ||
|
|
3a6d5cbefd | ||
|
|
f5d7049cc1 | ||
|
|
3c3c547ce0 | ||
|
|
1cbccb6dba | ||
|
|
bd92089d33 | ||
|
|
a6760f1525 | ||
|
|
66e601ef79 | ||
|
|
0cd259b2d8 | ||
|
|
83fb2d09e8 | ||
|
|
f3a5ee705f | ||
|
|
7cbbca9aaa | ||
|
|
5ec44056f7 | ||
|
|
492a7983dd | ||
|
|
a608b4c6c2 | ||
|
|
1f3a2c2944 | ||
|
|
7227d06156 | ||
|
|
14385c80fc | ||
|
|
76139d0801 | ||
|
|
da8d0c441a | ||
|
|
58996f3589 | ||
|
|
b539f988e1 | ||
|
|
6c00645712 | ||
|
|
b781eeaa15 | ||
|
|
e0b005d9cf | ||
|
|
3b8f0fe59e | ||
|
|
c831911be2 | ||
|
|
157caf511b | ||
|
|
0b53bec60b | ||
|
|
c568581ff3 | ||
|
|
2d7053438a | ||
|
|
5a93b9162b | ||
|
|
6d86fde09c | ||
|
|
510ed1e8d3 | ||
|
|
8caffd92df | ||
|
|
58a05b0ca1 | ||
|
|
6ee7f18f33 | ||
|
|
8f987883cb | ||
|
|
ebe0ba91db | ||
|
|
43a013c3a2 | ||
|
|
c25dbee40d | ||
|
|
19ab0f7ce5 | ||
|
|
67fe677c53 | ||
|
|
d56afd45fd | ||
|
|
a2393ed496 | ||
|
|
be6931ee27 | ||
|
|
9ef3b718d9 | ||
|
|
bb17e8f11c | ||
|
|
dcd80206b7 | ||
|
|
f4a0921c9c | ||
|
|
208c56256f | ||
|
|
9ac818a551 | ||
|
|
6ca2c91b96 | ||
|
|
e33192b269 | ||
|
|
61274bdef5 | ||
|
|
b40db4dfec |
@@ -1,7 +1,8 @@
|
|||||||
name: vllm_ci
|
name: vllm_ci
|
||||||
job_dirs:
|
job_dirs:
|
||||||
- ".buildkite/test_areas"
|
|
||||||
- ".buildkite/image_build"
|
- ".buildkite/image_build"
|
||||||
|
- ".buildkite/test_areas"
|
||||||
|
- ".buildkite/hardware_tests"
|
||||||
run_all_patterns:
|
run_all_patterns:
|
||||||
- "docker/Dockerfile"
|
- "docker/Dockerfile"
|
||||||
- "CMakeLists.txt"
|
- "CMakeLists.txt"
|
||||||
|
|||||||
30
.buildkite/hardware_tests/amd.yaml
Normal file
30
.buildkite/hardware_tests/amd.yaml
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
group: Hardware - AMD Build
|
||||||
|
steps:
|
||||||
|
- label: "AMD: :docker: build image"
|
||||||
|
key: image-build-amd
|
||||||
|
depends_on: []
|
||||||
|
device: amd_cpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- >
|
||||||
|
docker build
|
||||||
|
--build-arg max_jobs=16
|
||||||
|
--build-arg REMOTE_VLLM=1
|
||||||
|
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
|
||||||
|
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
||||||
|
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
|
-f docker/Dockerfile.rocm
|
||||||
|
--target test
|
||||||
|
--no-cache
|
||||||
|
--progress plain .
|
||||||
|
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
retry:
|
||||||
|
automatic:
|
||||||
|
- exit_status: -1 # Agent was lost
|
||||||
|
limit: 1
|
||||||
|
- exit_status: -10 # Agent was lost
|
||||||
|
limit: 1
|
||||||
|
- exit_status: 1 # Machine occasionally fail
|
||||||
|
limit: 1
|
||||||
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
group: Hardware
|
||||||
|
depends_on: ~
|
||||||
|
steps:
|
||||||
|
- label: "Ascend NPU Test"
|
||||||
|
soft_fail: true
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
no_plugin: true
|
||||||
|
device: ascend_npu
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh
|
||||||
100
.buildkite/hardware_tests/cpu.yaml
Normal file
100
.buildkite/hardware_tests/cpu.yaml
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
group: CPU
|
||||||
|
depends_on: []
|
||||||
|
steps:
|
||||||
|
- label: CPU-Kernel Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/
|
||||||
|
- cmake/cpu_extension.cmake
|
||||||
|
- CMakeLists.txt
|
||||||
|
- vllm/_custom_ops.py
|
||||||
|
- tests/kernels/attention/test_cpu_attn.py
|
||||||
|
- tests/kernels/moe/test_cpu_fused_moe.py
|
||||||
|
- tests/kernels/test_onednn.py
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||||
|
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||||
|
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||||
|
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||||
|
|
||||||
|
- label: CPU-Language Generation and Pooling Model Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/
|
||||||
|
- vllm/
|
||||||
|
- tests/models/language/generation/
|
||||||
|
- tests/models/language/pooling/
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
|
||||||
|
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||||
|
pytest -x -v -s tests/models/language/pooling -m cpu_model"
|
||||||
|
|
||||||
|
- label: CPU-Quantization Model Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/
|
||||||
|
- vllm/model_executor/layers/quantization/cpu_wna16.py
|
||||||
|
- vllm/model_executor/layers/quantization/gptq_marlin.py
|
||||||
|
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
|
||||||
|
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
|
||||||
|
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
|
||||||
|
- tests/quantization/test_compressed_tensors.py
|
||||||
|
- tests/quantization/test_cpu_wna16.py
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||||
|
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
|
||||||
|
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
|
||||||
|
|
||||||
|
- label: CPU-Distributed Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/cpu/shm.cpp
|
||||||
|
- vllm/v1/worker/cpu_worker.py
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- vllm/v1/worker/cpu_model_runner.py
|
||||||
|
- vllm/v1/worker/gpu_model_runner.py
|
||||||
|
- vllm/platforms/cpu.py
|
||||||
|
- vllm/distributed/parallel_state.py
|
||||||
|
- vllm/distributed/device_communicators/cpu_communicator.py
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
|
||||||
|
|
||||||
|
- label: CPU-Multi-Modal Model Tests %N
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
# - vllm/
|
||||||
|
- vllm/model_executor/layers/rotary_embedding
|
||||||
|
- tests/models/multimodal/generation/
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
|
||||||
|
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
|
||||||
|
parallelism: 2
|
||||||
|
|
||||||
|
- label: "Arm CPU Test"
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: arm_cpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
|
||||||
10
.buildkite/hardware_tests/gh200.yaml
Normal file
10
.buildkite/hardware_tests/gh200.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
group: Hardware
|
||||||
|
steps:
|
||||||
|
- label: "GH200 Test"
|
||||||
|
soft_fail: true
|
||||||
|
device: gh200
|
||||||
|
no_plugin: true
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
|
||||||
17
.buildkite/hardware_tests/intel.yaml
Normal file
17
.buildkite/hardware_tests/intel.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
group: Hardware
|
||||||
|
depends_on: ~
|
||||||
|
steps:
|
||||||
|
- label: "Intel HPU Test"
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_hpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
|
||||||
|
|
||||||
|
- label: "Intel GPU Test"
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_gpu
|
||||||
|
no_plugin: true
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
|
||||||
@@ -1,56 +1,255 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -e
|
set -euo pipefail
|
||||||
|
|
||||||
if [[ $# -lt 8 ]]; then
|
# replace invalid characters in Docker image tags and truncate to 128 chars
|
||||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
clean_docker_tag() {
|
||||||
exit 1
|
local input="$1"
|
||||||
|
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
|
||||||
|
}
|
||||||
|
|
||||||
|
print_usage_and_exit() {
|
||||||
|
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
print_instance_info() {
|
||||||
|
echo ""
|
||||||
|
echo "=== Debug: Instance Information ==="
|
||||||
|
# Get IMDSv2 token
|
||||||
|
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
|
||||||
|
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
|
||||||
|
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
|
||||||
|
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
|
||||||
|
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
|
||||||
|
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||||
|
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
|
||||||
|
echo "AMI ID: ${AMI_ID}"
|
||||||
|
echo "Instance Type: ${INSTANCE_TYPE}"
|
||||||
|
echo "Instance ID: ${INSTANCE_ID}"
|
||||||
|
echo "AZ: ${AZ}"
|
||||||
|
else
|
||||||
|
echo "Not running on EC2 or IMDS not available"
|
||||||
|
fi
|
||||||
|
# Check for warm cache AMI (marker file baked into custom AMI)
|
||||||
|
if [[ -f /etc/vllm-ami-info ]]; then
|
||||||
|
echo "Cache: warm (custom vLLM AMI)"
|
||||||
|
cat /etc/vllm-ami-info
|
||||||
|
else
|
||||||
|
echo "Cache: cold (standard AMI)"
|
||||||
|
fi
|
||||||
|
echo "==================================="
|
||||||
|
echo ""
|
||||||
|
}
|
||||||
|
|
||||||
|
setup_buildx_builder() {
|
||||||
|
echo "--- :buildkite: Setting up buildx builder"
|
||||||
|
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
|
||||||
|
# Custom AMI with standalone buildkitd - use remote driver for warm cache
|
||||||
|
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
|
||||||
|
echo "Using remote driver to connect to buildkitd (warm cache available)"
|
||||||
|
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
|
||||||
|
echo "Using existing baked-vllm-builder"
|
||||||
|
docker buildx use baked-vllm-builder
|
||||||
|
else
|
||||||
|
echo "Creating baked-vllm-builder with remote driver"
|
||||||
|
docker buildx create \
|
||||||
|
--name baked-vllm-builder \
|
||||||
|
--driver remote \
|
||||||
|
--use \
|
||||||
|
"unix://${BUILDKIT_SOCKET}"
|
||||||
|
fi
|
||||||
|
docker buildx inspect --bootstrap
|
||||||
|
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
|
||||||
|
# Existing builder available
|
||||||
|
echo "Using existing builder: ${BUILDER_NAME}"
|
||||||
|
docker buildx use "${BUILDER_NAME}"
|
||||||
|
docker buildx inspect --bootstrap
|
||||||
|
else
|
||||||
|
# No local buildkitd, no existing builder - create new docker-container builder
|
||||||
|
echo "No local buildkitd found, using docker-container driver"
|
||||||
|
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
|
||||||
|
docker buildx inspect --bootstrap
|
||||||
|
fi
|
||||||
|
|
||||||
|
# builder info
|
||||||
|
echo "Active builder:"
|
||||||
|
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
|
||||||
|
}
|
||||||
|
|
||||||
|
check_and_skip_if_image_exists() {
|
||||||
|
if [[ -n "${IMAGE_TAG:-}" ]]; then
|
||||||
|
echo "--- :mag: Checking if image exists"
|
||||||
|
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
|
||||||
|
echo "Image already exists: ${IMAGE_TAG}"
|
||||||
|
echo "Skipping build"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "Image not found, proceeding with build"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
ecr_login() {
|
||||||
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||||
|
}
|
||||||
|
|
||||||
|
prepare_cache_tags() {
|
||||||
|
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
|
||||||
|
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
|
||||||
|
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
|
||||||
|
|
||||||
|
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||||
|
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
||||||
|
cache="${MAIN_CACHE_ECR}:latest"
|
||||||
|
else
|
||||||
|
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
|
||||||
|
cache="${TEST_CACHE_ECR}:${clean_branch}"
|
||||||
|
fi
|
||||||
|
CACHE_TO="$cache"
|
||||||
|
CACHE_FROM="$cache"
|
||||||
|
CACHE_FROM_BASE_BRANCH="$cache"
|
||||||
|
else
|
||||||
|
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||||
|
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||||
|
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
|
||||||
|
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
|
||||||
|
else
|
||||||
|
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
|
||||||
|
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
|
||||||
|
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
|
||||||
|
}
|
||||||
|
|
||||||
|
resolve_parent_commit() {
|
||||||
|
if [[ -z "${PARENT_COMMIT:-}" ]]; then
|
||||||
|
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
|
||||||
|
if [[ -n "${PARENT_COMMIT}" ]]; then
|
||||||
|
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
|
||||||
|
export PARENT_COMMIT
|
||||||
|
else
|
||||||
|
echo "Could not determine parent commit (may be first commit in repo)"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
print_bake_config() {
|
||||||
|
echo "--- :page_facing_up: Resolved bake configuration"
|
||||||
|
# Write to a temp directory to avoid polluting the repo root (which is the
|
||||||
|
# Docker build context). Files left in the repo root get COPY'd into the
|
||||||
|
# image and can cause duplicate artifact uploads from downstream steps.
|
||||||
|
local bake_tmp
|
||||||
|
bake_tmp="$(mktemp -d)"
|
||||||
|
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
|
||||||
|
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
|
||||||
|
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
|
||||||
|
echo "--- :arrow_down: Uploading bake config to Buildkite"
|
||||||
|
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
|
||||||
|
}
|
||||||
|
|
||||||
|
#################################
|
||||||
|
# Main Script #
|
||||||
|
#################################
|
||||||
|
print_instance_info
|
||||||
|
|
||||||
|
if [[ $# -lt 5 ]]; then
|
||||||
|
print_usage_and_exit
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# input args
|
||||||
REGISTRY=$1
|
REGISTRY=$1
|
||||||
REPO=$2
|
REPO=$2
|
||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
BRANCH=$4
|
BRANCH=$4
|
||||||
VLLM_USE_PRECOMPILED=$5
|
IMAGE_TAG=$5
|
||||||
VLLM_MERGE_BASE_COMMIT=$6
|
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
|
||||||
CACHE_FROM=$7
|
|
||||||
CACHE_TO=$8
|
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# build config
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
TARGET="test-ci"
|
||||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
|
||||||
|
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
|
||||||
|
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
|
||||||
|
CI_HCL_PATH="/tmp/ci.hcl"
|
||||||
|
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
|
||||||
|
|
||||||
# docker buildx
|
prepare_cache_tags
|
||||||
docker buildx create --name vllm-builder --driver docker-container --use
|
ecr_login
|
||||||
docker buildx inspect --bootstrap
|
|
||||||
docker buildx ls
|
|
||||||
|
|
||||||
# skip build if image already exists
|
# Environment info (for docs and human readers)
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
|
||||||
echo "Image not found, proceeding with build..."
|
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
|
||||||
else
|
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
|
||||||
echo "Image found"
|
#
|
||||||
exit 0
|
# Build configuration (exported as environment variables for bake):
|
||||||
|
export BUILDKITE_COMMIT
|
||||||
|
export PARENT_COMMIT
|
||||||
|
export IMAGE_TAG
|
||||||
|
export IMAGE_TAG_LATEST
|
||||||
|
export CACHE_FROM
|
||||||
|
export CACHE_FROM_BASE_BRANCH
|
||||||
|
export CACHE_FROM_MAIN
|
||||||
|
export CACHE_TO
|
||||||
|
|
||||||
|
# print args
|
||||||
|
echo "--- :mag: Arguments"
|
||||||
|
echo "REGISTRY: ${REGISTRY}"
|
||||||
|
echo "REPO: ${REPO}"
|
||||||
|
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
|
||||||
|
echo "BRANCH: ${BRANCH}"
|
||||||
|
echo "IMAGE_TAG: ${IMAGE_TAG}"
|
||||||
|
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
|
||||||
|
|
||||||
|
# print build configuration
|
||||||
|
echo "--- :mag: Build configuration"
|
||||||
|
echo "TARGET: ${TARGET}"
|
||||||
|
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||||
|
echo "BUILDER_NAME: ${BUILDER_NAME}"
|
||||||
|
echo "CI_HCL_URL: ${CI_HCL_URL}"
|
||||||
|
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
|
||||||
|
|
||||||
|
echo "--- :mag: Cache tags"
|
||||||
|
echo "CACHE_TO: ${CACHE_TO}"
|
||||||
|
echo "CACHE_FROM: ${CACHE_FROM}"
|
||||||
|
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
|
||||||
|
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
|
||||||
|
|
||||||
|
check_and_skip_if_image_exists
|
||||||
|
|
||||||
|
echo "--- :docker: Setting up Docker buildx bake"
|
||||||
|
echo "Target: ${TARGET}"
|
||||||
|
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||||
|
echo "CI HCL path: ${CI_HCL_PATH}"
|
||||||
|
|
||||||
|
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
|
||||||
|
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
|
||||||
|
echo "Make sure you're running from the vLLM repository root"
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
echo "--- :arrow_down: Downloading ci.hcl"
|
||||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
|
||||||
else
|
echo "Downloaded to ${CI_HCL_PATH}"
|
||||||
merge_base_commit_build_args=""
|
|
||||||
|
if [[ ! -f "${CI_HCL_PATH}" ]]; then
|
||||||
|
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# build
|
setup_buildx_builder
|
||||||
docker buildx build --file docker/Dockerfile \
|
|
||||||
--build-arg max_jobs=16 \
|
resolve_parent_commit
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
export PARENT_COMMIT
|
||||||
--build-arg USE_SCCACHE=1 \
|
|
||||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
print_bake_config
|
||||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
|
||||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
echo "--- :docker: Building ${TARGET}"
|
||||||
${merge_base_commit_build_args} \
|
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
|
||||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
|
||||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
echo "--- :white_check_mark: Build complete"
|
||||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
|
||||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
|
||||||
--push \
|
|
||||||
--target test \
|
|
||||||
--progress plain .
|
|
||||||
|
|||||||
@@ -3,8 +3,9 @@ steps:
|
|||||||
- label: ":docker: Build image"
|
- label: ":docker: Build image"
|
||||||
key: image-build
|
key: image-build
|
||||||
depends_on: []
|
depends_on: []
|
||||||
|
timeout_in_minutes: 600
|
||||||
commands:
|
commands:
|
||||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
|
||||||
retry:
|
retry:
|
||||||
automatic:
|
automatic:
|
||||||
- exit_status: -1 # Agent was lost
|
- exit_status: -1 # Agent was lost
|
||||||
@@ -40,7 +41,7 @@ steps:
|
|||||||
limit: 2
|
limit: 2
|
||||||
- exit_status: -10 # Agent was lost
|
- exit_status: -10 # Agent was lost
|
||||||
limit: 2
|
limit: 2
|
||||||
|
|
||||||
- label: ":docker: Build CPU arm64 image"
|
- label: ":docker: Build CPU arm64 image"
|
||||||
key: cpu-arm64-image-build
|
key: cpu-arm64-image-build
|
||||||
depends_on: []
|
depends_on: []
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ REPO=$2
|
|||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# authenticate with AWS ECR
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
|
||||||
# skip build if image already exists
|
# skip build if image already exists
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
|
||||||
echo "Image not found, proceeding with build..."
|
echo "Image not found, proceeding with build..."
|
||||||
else
|
else
|
||||||
echo "Image found"
|
echo "Image found"
|
||||||
@@ -24,13 +24,13 @@ fi
|
|||||||
# build
|
# build
|
||||||
docker build --file docker/Dockerfile.cpu \
|
docker build --file docker/Dockerfile.cpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||||
--build-arg VLLM_CPU_AMXBF16=true \
|
--build-arg VLLM_CPU_AMXBF16=true \
|
||||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
|
||||||
--target vllm-test \
|
--target vllm-test \
|
||||||
--progress plain .
|
--progress plain .
|
||||||
|
|
||||||
# push
|
# push
|
||||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ REPO=$2
|
|||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# authenticate with AWS ECR
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
|
||||||
# skip build if image already exists
|
# skip build if image already exists
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
|
||||||
echo "Image not found, proceeding with build..."
|
echo "Image not found, proceeding with build..."
|
||||||
else
|
else
|
||||||
echo "Image found"
|
echo "Image found"
|
||||||
@@ -24,10 +24,10 @@ fi
|
|||||||
# build
|
# build
|
||||||
docker build --file docker/Dockerfile.cpu \
|
docker build --file docker/Dockerfile.cpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
|
||||||
--target vllm-test \
|
--target vllm-test \
|
||||||
--progress plain .
|
--progress plain .
|
||||||
|
|
||||||
# push
|
# push
|
||||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ REPO=$2
|
|||||||
BUILDKITE_COMMIT=$3
|
BUILDKITE_COMMIT=$3
|
||||||
|
|
||||||
# authenticate with AWS ECR
|
# authenticate with AWS ECR
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||||
|
|
||||||
# skip build if image already exists
|
# skip build if image already exists
|
||||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
|
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
|
||||||
echo "Image not found, proceeding with build..."
|
echo "Image not found, proceeding with build..."
|
||||||
else
|
else
|
||||||
echo "Image found"
|
echo "Image found"
|
||||||
@@ -25,10 +25,10 @@ fi
|
|||||||
docker build \
|
docker build \
|
||||||
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
|
||||||
--progress plain \
|
--progress plain \
|
||||||
https://github.com/vllm-project/vllm-gaudi.git
|
https://github.com/vllm-project/vllm-gaudi.git
|
||||||
|
|
||||||
# push
|
# push
|
||||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
|
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.695
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.447
|
||||||
|
limit: 1319
|
||||||
|
num_fewshot: 5
|
||||||
|
max_model_len: 262144
|
||||||
|
enforce_eager: false
|
||||||
|
apply_chat_template: true
|
||||||
|
fewshot_as_multiturn: true
|
||||||
|
trust_remote_code: true
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.7142
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.4579
|
||||||
|
env_vars:
|
||||||
|
VLLM_USE_FLASHINFER_MOE_FP8: "1"
|
||||||
|
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
|
||||||
|
limit: 1319
|
||||||
|
num_fewshot: 5
|
||||||
|
max_model_len: 262144
|
||||||
|
kv_cache_dtype: fp8
|
||||||
|
enforce_eager: false
|
||||||
|
apply_chat_template: true
|
||||||
|
fewshot_as_multiturn: true
|
||||||
|
trust_remote_code: true
|
||||||
@@ -1 +1,2 @@
|
|||||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
||||||
|
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
|
|||||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||||
Qwen2-57B-A14-Instruct.yaml
|
Qwen2-57B-A14-Instruct.yaml
|
||||||
DeepSeek-V2-Lite-Chat.yaml
|
DeepSeek-V2-Lite-Chat.yaml
|
||||||
|
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
|
|||||||
--tasks chartqa \
|
--tasks chartqa \
|
||||||
--batch_size auto \
|
--batch_size auto \
|
||||||
--apply_chat_template \
|
--apply_chat_template \
|
||||||
--limit $LIMIT
|
--limit "$LIMIT"
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# We can use this script to compute baseline accuracy on GSM for transformers.
|
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
# We use this for fp8, which HF does not support.
|
# We use this for fp8, which HF does not support.
|
||||||
#
|
#
|
||||||
# Make sure you have lm-eval-harness installed:
|
# Make sure you have lm-eval-harness installed:
|
||||||
# pip install "lm-eval[api]>=0.4.9.2"
|
# pip install "lm-eval[api]>=0.4.11"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo``
|
echo``
|
||||||
@@ -20,14 +20,11 @@ usage() {
|
|||||||
echo
|
echo
|
||||||
}
|
}
|
||||||
|
|
||||||
while getopts "m:b:l:f:t:" OPT; do
|
while getopts "m:l:f:t:" OPT; do
|
||||||
case ${OPT} in
|
case ${OPT} in
|
||||||
m )
|
m )
|
||||||
MODEL="$OPTARG"
|
MODEL="$OPTARG"
|
||||||
;;
|
;;
|
||||||
b )
|
|
||||||
BATCH_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
l )
|
l )
|
||||||
LIMIT="$OPTARG"
|
LIMIT="$OPTARG"
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -9,8 +9,10 @@ import json
|
|||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from importlib import util
|
from importlib import util
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import regex as re
|
||||||
|
|
||||||
pd.options.display.float_format = "{:.2f}".format
|
pd.options.display.float_format = "{:.2f}".format
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
plotly_found = util.find_spec("plotly.express") is not None
|
||||||
@@ -275,6 +277,131 @@ def _apply_two_decimals(
|
|||||||
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------
|
||||||
|
# Export helpers (Excel + CSV)
|
||||||
|
# -----------------------------
|
||||||
|
def _sanitize_sheet_name(name: str) -> str:
|
||||||
|
"""
|
||||||
|
Excel sheet constraints:
|
||||||
|
- max 31 chars
|
||||||
|
- cannot contain: : \ / ? * [ ]
|
||||||
|
- cannot be empty
|
||||||
|
"""
|
||||||
|
name = "sheet" if name is None else str(name)
|
||||||
|
name = re.sub(r"[:\\/?*\[\]]", "_", name)
|
||||||
|
name = name.strip().strip("'")
|
||||||
|
name = re.sub(r"\s+", " ", name)
|
||||||
|
if not name:
|
||||||
|
name = "sheet"
|
||||||
|
return name[:31]
|
||||||
|
|
||||||
|
|
||||||
|
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
|
||||||
|
d = dict(zip(group_cols, gkey_tuple))
|
||||||
|
model = d.get("Model", "model")
|
||||||
|
model_short = str(model).split("/")[-1]
|
||||||
|
ilen = d.get("Input Len", "")
|
||||||
|
olen = d.get("Output Len", "")
|
||||||
|
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
|
||||||
|
return _sanitize_sheet_name(f"{model_short}{lens}")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_tables_to_excel_sheet(
|
||||||
|
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
|
||||||
|
):
|
||||||
|
startrow = 0
|
||||||
|
for title, df in blocks:
|
||||||
|
pd.DataFrame([[title]]).to_excel(
|
||||||
|
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
|
||||||
|
)
|
||||||
|
startrow += 1
|
||||||
|
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
|
||||||
|
startrow += len(df) + 3
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_filename(s: str) -> str:
|
||||||
|
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
|
||||||
|
return s[:180] if len(s) > 180 else s
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------
|
||||||
|
# vLLM environment export helper
|
||||||
|
# -----------------------------
|
||||||
|
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
|
||||||
|
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- section headers as standalone lines (no ':' or '=')
|
||||||
|
- key-value lines like 'OS: Ubuntu ...'
|
||||||
|
- env var lines like 'HF_HOME=/data/hf'
|
||||||
|
"""
|
||||||
|
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
|
||||||
|
section = "General"
|
||||||
|
rows: list[dict] = []
|
||||||
|
|
||||||
|
def set_section(s: str):
|
||||||
|
nonlocal section
|
||||||
|
s = (s or "").strip()
|
||||||
|
if s:
|
||||||
|
section = s
|
||||||
|
|
||||||
|
for raw in lines:
|
||||||
|
stripped = raw.strip()
|
||||||
|
if not stripped:
|
||||||
|
continue
|
||||||
|
# divider lines like =====
|
||||||
|
if set(stripped) <= {"="}:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# section header heuristic: short standalone line
|
||||||
|
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
|
||||||
|
if stripped.lower().startswith("collecting environment information"):
|
||||||
|
continue
|
||||||
|
set_section(stripped)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# env var style: KEY=VALUE (and not a URL with :)
|
||||||
|
if "=" in stripped and ":" not in stripped:
|
||||||
|
k, v = stripped.split("=", 1)
|
||||||
|
k = k.strip()
|
||||||
|
v = v.strip()
|
||||||
|
if k:
|
||||||
|
rows.append({"Section": section, "Key": k, "Value": v})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# key: value
|
||||||
|
if ":" in stripped:
|
||||||
|
k, v = stripped.split(":", 1)
|
||||||
|
k = k.strip()
|
||||||
|
v = v.strip()
|
||||||
|
if k:
|
||||||
|
rows.append({"Section": section, "Key": k, "Value": v})
|
||||||
|
continue
|
||||||
|
|
||||||
|
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
|
||||||
|
|
||||||
|
|
||||||
|
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
|
||||||
|
"""Load vllm_env.txt next to the *original* input JSON file.
|
||||||
|
|
||||||
|
Note: when only one -f is provided, the script may split JSON into ./splits/...,
|
||||||
|
but vllm_env.txt typically lives next to the original benchmark_results.json.
|
||||||
|
"""
|
||||||
|
base_dir: Path | None = None
|
||||||
|
if getattr(args, "file", None):
|
||||||
|
base_dir = Path(args.file[0]).resolve().parent
|
||||||
|
elif files:
|
||||||
|
base_dir = Path(files[0]).resolve().parent
|
||||||
|
if base_dir is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
env_path = base_dir / "vllm_env.txt"
|
||||||
|
if not env_path.exists():
|
||||||
|
return None
|
||||||
|
df = _parse_vllm_env_txt(env_path)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Valid max concurrency summary helpers
|
# Valid max concurrency summary helpers
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
|
|
||||||
summary_df = pd.DataFrame(rows)
|
summary_df = pd.DataFrame(rows)
|
||||||
|
|
||||||
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
|
|
||||||
for c in summary_df.columns:
|
for c in summary_df.columns:
|
||||||
if c == "Configuration":
|
if c == "Configuration":
|
||||||
continue
|
continue
|
||||||
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
|
|
||||||
both_col = f"Max {conc_col} (Both)"
|
both_col = f"Max {conc_col} (Both)"
|
||||||
|
|
||||||
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
|
|
||||||
formatters = {}
|
formatters = {}
|
||||||
for c in summary_df.columns:
|
for c in summary_df.columns:
|
||||||
if c == "Configuration":
|
if c == "Configuration":
|
||||||
continue
|
continue
|
||||||
# default argument binds per-column formatter correctly
|
|
||||||
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
|
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
|
||||||
|
|
||||||
styler = summary_df.style.format(formatters)
|
styler = summary_df.style.format(formatters)
|
||||||
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
|
||||||
|
|
||||||
|
|
||||||
|
def build_valid_max_concurrency_summary_df(
|
||||||
|
tput_group_df: pd.DataFrame | None,
|
||||||
|
ttft_group_df: pd.DataFrame | None,
|
||||||
|
tpot_group_df: pd.DataFrame | None,
|
||||||
|
conc_col: str,
|
||||||
|
args,
|
||||||
|
) -> pd.DataFrame | None:
|
||||||
|
if ttft_group_df is None and tpot_group_df is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
ttft_cols = (
|
||||||
|
_config_value_columns(ttft_group_df, conc_col)
|
||||||
|
if ttft_group_df is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
tpot_cols = (
|
||||||
|
_config_value_columns(tpot_group_df, conc_col)
|
||||||
|
if tpot_group_df is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
tput_cols = (
|
||||||
|
_config_value_columns(tput_group_df, conc_col)
|
||||||
|
if tput_group_df is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
|
||||||
|
if ttft_group_df is not None and tpot_group_df is not None:
|
||||||
|
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
|
||||||
|
if tput_group_df is not None:
|
||||||
|
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
|
||||||
|
else:
|
||||||
|
cfg_cols = ttft_cols or tpot_cols
|
||||||
|
|
||||||
|
if not cfg_cols:
|
||||||
|
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for cfg in cfg_cols:
|
||||||
|
ttft_max = (
|
||||||
|
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
||||||
|
if ttft_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
tpot_max = (
|
||||||
|
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
||||||
|
if tpot_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
both = (
|
||||||
|
pd.NA
|
||||||
|
if (pd.isna(ttft_max) or pd.isna(tpot_max))
|
||||||
|
else min(ttft_max, tpot_max)
|
||||||
|
)
|
||||||
|
|
||||||
|
tput_at_both = (
|
||||||
|
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
|
||||||
|
if tput_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
ttft_at_both = (
|
||||||
|
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
|
||||||
|
if ttft_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
tpot_at_both = (
|
||||||
|
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
|
||||||
|
if tpot_group_df is not None
|
||||||
|
else pd.NA
|
||||||
|
)
|
||||||
|
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"Configuration": cfg,
|
||||||
|
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
||||||
|
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
||||||
|
f"Max {conc_col} (Both)": both,
|
||||||
|
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||||
|
"TTFT @ Both (ms)": ttft_at_both,
|
||||||
|
"TPOT @ Both (ms)": tpot_at_both,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
df = pd.DataFrame(rows)
|
||||||
|
for c in df.columns:
|
||||||
|
if c != "Configuration":
|
||||||
|
df[c] = pd.to_numeric(df[c], errors="coerce")
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Plot helper
|
# Plot helper
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
default=100.0,
|
default=100.0,
|
||||||
help="Reference limit for TPOT plots (ms)",
|
help="Reference limit for TPOT plots (ms)",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ---- NEW: export options ----
|
||||||
|
parser.add_argument(
|
||||||
|
"--excel-out",
|
||||||
|
type=str,
|
||||||
|
default="perf_comparison.xlsx",
|
||||||
|
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--csv-out-dir",
|
||||||
|
type=str,
|
||||||
|
default="",
|
||||||
|
help="If set, write per-group per-metric CSVs into this directory.",
|
||||||
|
)
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@@ -657,7 +885,6 @@ def maybe_write_plot(
|
|||||||
markers=True,
|
markers=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Ensure plot hover + y tick labels are also 2 decimals.
|
|
||||||
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
|
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
|
||||||
fig.update_yaxes(tickformat=".2f")
|
fig.update_yaxes(tickformat=".2f")
|
||||||
|
|
||||||
@@ -730,87 +957,151 @@ def write_report_group_first(
|
|||||||
for metric_label, (df, _) in metric_cache.items()
|
for metric_label, (df, _) in metric_cache.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
|
||||||
main_fh.write('<meta charset="utf-8">\n')
|
if csv_dir:
|
||||||
for gkey in group_keys:
|
csv_dir.mkdir(parents=True, exist_ok=True)
|
||||||
gkey_tuple = normalize_group_key(gkey)
|
|
||||||
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
|
||||||
sub_path = group_filename(gkey_tuple)
|
|
||||||
group_header = (
|
|
||||||
'<div style="font-size: 1.4em; font-weight: 700; '
|
|
||||||
'margin: 18px 0 10px 0;">'
|
|
||||||
f"{_html.escape(suffix)}"
|
|
||||||
"</div>\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
main_fh.write(group_header)
|
excel_path = args.excel_out or "perf_comparison.xlsx"
|
||||||
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
|
||||||
sub_fh.write('<meta charset="utf-8">\n')
|
# ---- Environment sheet (first) ----
|
||||||
sub_fh.write(group_header)
|
env_sheet = _sanitize_sheet_name("Environment")
|
||||||
tput_group_df = None
|
env_df = _load_env_df_for_inputs(args, files)
|
||||||
ttft_group_df = None
|
if env_df is None or env_df.empty:
|
||||||
tpot_group_df = None
|
pd.DataFrame(
|
||||||
conc_col = args.xaxis
|
[
|
||||||
|
{
|
||||||
|
"Section": "Environment",
|
||||||
|
"Key": "vllm_env.txt",
|
||||||
|
"Value": "NOT FOUND (or empty)",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
).to_excel(xw, sheet_name=env_sheet, index=False)
|
||||||
|
else:
|
||||||
|
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
|
||||||
|
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
||||||
|
main_fh.write('<meta charset="utf-8">\n')
|
||||||
|
for gkey in group_keys:
|
||||||
|
gkey_tuple = normalize_group_key(gkey)
|
||||||
|
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
|
||||||
|
sub_path = group_filename(gkey_tuple)
|
||||||
|
group_header = (
|
||||||
|
'<div style="font-size: 1.4em; font-weight: 700; '
|
||||||
|
'margin: 18px 0 10px 0;">'
|
||||||
|
f"{_html.escape(suffix)}"
|
||||||
|
"</div>\n"
|
||||||
|
)
|
||||||
|
|
||||||
for metric_label in plan.data_cols:
|
main_fh.write(group_header)
|
||||||
gb = metric_groupbys[metric_label]
|
|
||||||
df_sorted, raw_data_cols = metric_cache[metric_label]
|
|
||||||
|
|
||||||
try:
|
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
|
||||||
group_df = gb.get_group(gkey)
|
sheet_base = sheet
|
||||||
except KeyError:
|
dedup_i = 1
|
||||||
missing = (
|
while sheet in xw.sheets:
|
||||||
'<div style="font-size: 1.1em; font-weight: 600; '
|
dedup_i += 1
|
||||||
'margin: 10px 0;">'
|
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
|
||||||
f"{_html.escape(metric_label)} — missing for this group"
|
|
||||||
"</div>\n"
|
excel_blocks: list[tuple[str, pd.DataFrame]] = []
|
||||||
|
|
||||||
|
with open(sub_path, "w", encoding="utf-8") as sub_fh:
|
||||||
|
sub_fh.write('<meta charset="utf-8">\n')
|
||||||
|
sub_fh.write(group_header)
|
||||||
|
tput_group_df = None
|
||||||
|
ttft_group_df = None
|
||||||
|
tpot_group_df = None
|
||||||
|
conc_col = args.xaxis
|
||||||
|
|
||||||
|
for metric_label in plan.data_cols:
|
||||||
|
gb = metric_groupbys[metric_label]
|
||||||
|
df_sorted, raw_data_cols = metric_cache[metric_label]
|
||||||
|
|
||||||
|
try:
|
||||||
|
group_df = gb.get_group(gkey)
|
||||||
|
except KeyError:
|
||||||
|
missing = (
|
||||||
|
'<div style="font-size: 1.1em; font-weight: 600; '
|
||||||
|
'margin: 10px 0;">'
|
||||||
|
f"{_html.escape(metric_label)} — missing for this group"
|
||||||
|
"</div>\n"
|
||||||
|
)
|
||||||
|
main_fh.write(missing)
|
||||||
|
sub_fh.write(missing)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if conc_col not in group_df.columns:
|
||||||
|
conc_col = _find_concurrency_col(group_df)
|
||||||
|
|
||||||
|
mn = metric_label.lower().strip()
|
||||||
|
if "tok/s" in mn:
|
||||||
|
tput_group_df = group_df
|
||||||
|
elif "ttft" in mn:
|
||||||
|
ttft_group_df = group_df
|
||||||
|
elif mn in ("p99", "median") or "tpot" in mn:
|
||||||
|
tpot_group_df = group_df
|
||||||
|
|
||||||
|
display_group = group_df.drop(
|
||||||
|
columns=group_cols_canonical, errors="ignore"
|
||||||
)
|
)
|
||||||
|
|
||||||
main_fh.write(missing)
|
html = render_metric_table_html(
|
||||||
sub_fh.write(missing)
|
display_group, metric_label, suffix, args
|
||||||
continue
|
)
|
||||||
|
main_fh.write(html)
|
||||||
|
sub_fh.write(html)
|
||||||
|
|
||||||
if conc_col not in group_df.columns:
|
maybe_write_plot(
|
||||||
conc_col = _find_concurrency_col(group_df)
|
main_fh,
|
||||||
|
sub_fh,
|
||||||
|
group_df=group_df,
|
||||||
|
raw_data_cols=raw_data_cols,
|
||||||
|
metric_label=metric_label,
|
||||||
|
y_axis_col=y_axis_col,
|
||||||
|
args=args,
|
||||||
|
)
|
||||||
|
|
||||||
mn = metric_label.lower().strip()
|
excel_blocks.append(
|
||||||
if "tok/s" in mn:
|
(metric_label, display_group.reset_index(drop=True))
|
||||||
tput_group_df = group_df
|
)
|
||||||
elif "ttft" in mn:
|
if csv_dir:
|
||||||
ttft_group_df = group_df
|
fn = _safe_filename(
|
||||||
elif mn in ("p99", "median") or "tpot" in mn:
|
f"{sheet}__{metric_label}".replace(" ", "_").replace(
|
||||||
tpot_group_df = group_df
|
"/", "_"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||||
|
|
||||||
display_group = group_df.drop(
|
summary_html = build_valid_max_concurrency_summary_html(
|
||||||
columns=group_cols_canonical, errors="ignore"
|
tput_group_df=tput_group_df,
|
||||||
)
|
ttft_group_df=ttft_group_df,
|
||||||
|
tpot_group_df=tpot_group_df,
|
||||||
html = render_metric_table_html(
|
conc_col=conc_col,
|
||||||
display_group, metric_label, suffix, args
|
|
||||||
)
|
|
||||||
main_fh.write(html)
|
|
||||||
sub_fh.write(html)
|
|
||||||
|
|
||||||
maybe_write_plot(
|
|
||||||
main_fh,
|
|
||||||
sub_fh,
|
|
||||||
group_df=group_df,
|
|
||||||
raw_data_cols=raw_data_cols,
|
|
||||||
metric_label=metric_label,
|
|
||||||
y_axis_col=y_axis_col,
|
|
||||||
args=args,
|
args=args,
|
||||||
)
|
)
|
||||||
|
if summary_html:
|
||||||
|
main_fh.write(summary_html)
|
||||||
|
sub_fh.write(summary_html)
|
||||||
|
|
||||||
summary_html = build_valid_max_concurrency_summary_html(
|
summary_df = build_valid_max_concurrency_summary_df(
|
||||||
tput_group_df=tput_group_df,
|
tput_group_df=tput_group_df,
|
||||||
ttft_group_df=ttft_group_df,
|
ttft_group_df=ttft_group_df,
|
||||||
tpot_group_df=tpot_group_df,
|
tpot_group_df=tpot_group_df,
|
||||||
conc_col=conc_col,
|
conc_col=conc_col,
|
||||||
args=args,
|
args=args,
|
||||||
)
|
)
|
||||||
if summary_html:
|
if summary_df is not None:
|
||||||
main_fh.write(summary_html)
|
excel_blocks.append(
|
||||||
sub_fh.write(summary_html)
|
("Valid Max Concurrency Summary", summary_df)
|
||||||
|
)
|
||||||
|
if csv_dir:
|
||||||
|
fn = _safe_filename(
|
||||||
|
f"{sheet}__Valid_Max_Concurrency_Summary"
|
||||||
|
)
|
||||||
|
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||||
|
|
||||||
|
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
|
||||||
|
|
||||||
|
print(f"Wrote Excel: {excel_path}")
|
||||||
|
if csv_dir:
|
||||||
|
print(f"Wrote CSVs under: {csv_dir}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
@@ -393,7 +393,7 @@ if __name__ == "__main__":
|
|||||||
with open(results_folder / md_file, "w") as f:
|
with open(results_folder / md_file, "w") as f:
|
||||||
results = read_markdown(
|
results = read_markdown(
|
||||||
"../.buildkite/performance-benchmarks/"
|
"../.buildkite/performance-benchmarks/"
|
||||||
+ "performance-benchmarks-descriptions.md"
|
"performance-benchmarks-descriptions.md"
|
||||||
)
|
)
|
||||||
results = results.format(
|
results = results.format(
|
||||||
latency_tests_markdown_table=latency_md_table,
|
latency_tests_markdown_table=latency_md_table,
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script should be run inside the CI process
|
|
||||||
# This script assumes that we are already inside the vllm/ directory
|
# This script assumes that we are already inside the vllm/ directory
|
||||||
# Benchmarking results will be available inside vllm/benchmarks/results/
|
# Benchmarking results will be available inside vllm/benchmarks/results/
|
||||||
|
|
||||||
@@ -9,14 +7,19 @@
|
|||||||
set -x
|
set -x
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
|
# Environment-driven debug controls (like ON_CPU=1)
|
||||||
|
DRY_RUN="${DRY_RUN:-0}"
|
||||||
|
MODEL_FILTER="${MODEL_FILTER:-}"
|
||||||
|
DTYPE_FILTER="${DTYPE_FILTER:-}"
|
||||||
|
|
||||||
check_gpus() {
|
check_gpus() {
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
# check the number of GPUs and GPU type.
|
# check the number of GPUs and GPU type.
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
|
||||||
elif command -v amd-smi; then
|
elif command -v amd-smi; then
|
||||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
|
||||||
elif command -v hl-smi; then
|
elif command -v hl-smi; then
|
||||||
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
|
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
if [[ $gpu_count -gt 0 ]]; then
|
||||||
@@ -25,9 +28,9 @@ check_gpus() {
|
|||||||
echo "Need at least 1 GPU to run benchmarking."
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
declare -g arch_suffix=''
|
declare -g arch_suffix=''
|
||||||
|
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||||
elif command -v amd-smi; then
|
elif command -v amd-smi; then
|
||||||
@@ -44,7 +47,7 @@ check_cpus() {
|
|||||||
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
||||||
if [[ $numa_count -gt 0 ]]; then
|
if [[ $numa_count -gt 0 ]]; then
|
||||||
echo "NUMA found."
|
echo "NUMA found."
|
||||||
echo $numa_count
|
echo "$numa_count"
|
||||||
else
|
else
|
||||||
echo "Need at least 1 NUMA to run benchmarking."
|
echo "Need at least 1 NUMA to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
@@ -112,13 +115,12 @@ json2envs() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
wait_for_server() {
|
wait_for_server() {
|
||||||
# wait for vllm server to start
|
|
||||||
# return 1 if vllm server crashes
|
|
||||||
local timeout_val="1200"
|
local timeout_val="1200"
|
||||||
timeout "$timeout_val" bash -c '
|
timeout "$timeout_val" bash -c '
|
||||||
until curl -X POST localhost:8000/v1/completions; do
|
until curl -sf http://localhost:8000/v1/models >/dev/null; do
|
||||||
sleep 1
|
sleep 1
|
||||||
done' && return 0 || return 1
|
done
|
||||||
|
'
|
||||||
}
|
}
|
||||||
|
|
||||||
kill_processes_launched_by_current_bash() {
|
kill_processes_launched_by_current_bash() {
|
||||||
@@ -181,19 +183,20 @@ upload_to_buildkite() {
|
|||||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||||
}
|
}
|
||||||
|
|
||||||
run_latency_tests() {
|
run_benchmark_tests() {
|
||||||
# run latency tests using `vllm bench latency` command
|
# run benchmark tests using `vllm bench <test_type>` command
|
||||||
# $1: a json file specifying latency test cases
|
# $1: test type (latency or throughput)
|
||||||
|
# $2: a json file specifying test cases
|
||||||
|
|
||||||
local latency_test_file
|
local test_type=$1
|
||||||
latency_test_file=$1
|
local test_file=$2
|
||||||
|
|
||||||
# Iterate over latency tests
|
# Iterate over tests
|
||||||
jq -c '.[]' "$latency_test_file" | while read -r params; do
|
jq -c '.[]' "$test_file" | while read -r params; do
|
||||||
# get the test name, and append the GPU type back to it.
|
# get the test name, and append the GPU type back to it.
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
if [[ ! "$test_name" =~ ^latency_ ]]; then
|
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
|
||||||
echo "In latency-test.json, test_name must start with \"latency_\"."
|
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -204,15 +207,15 @@ run_latency_tests() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# get arguments
|
# get arguments
|
||||||
latency_params=$(echo "$params" | jq -r '.parameters')
|
bench_params=$(echo "$params" | jq -r '.parameters')
|
||||||
latency_args=$(json2args "$latency_params")
|
bench_args=$(json2args "$bench_params")
|
||||||
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||||
latency_envs=$(json2envs "$latency_environment_variables")
|
bench_envs=$(json2envs "$bench_environment_variables")
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [[ "$ON_CPU" == "1" ]]; then
|
||||||
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
|
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
|
||||||
world_size=$(($tp*$pp))
|
world_size=$(($tp*$pp))
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||||
@@ -225,118 +228,42 @@ run_latency_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
latency_command=" $latency_envs vllm bench latency \
|
bench_command=" $bench_envs vllm bench $test_type \
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
$latency_args"
|
$bench_args"
|
||||||
|
|
||||||
echo "Running test case $test_name"
|
echo "Running test case $test_name"
|
||||||
echo "Latency command: $latency_command"
|
echo "${test_type^} command: $bench_command"
|
||||||
|
|
||||||
# recoding benchmarking command ang GPU command
|
# recording benchmarking command and GPU command
|
||||||
jq_output=$(jq -n \
|
jq_output=$(jq -n \
|
||||||
--arg latency "$latency_command" \
|
--arg command "$bench_command" \
|
||||||
--arg gpu "$gpu_type" \
|
--arg gpu "$gpu_type" \
|
||||||
|
--arg test_type "$test_type" \
|
||||||
'{
|
'{
|
||||||
latency_command: $latency,
|
($test_type + "_command"): $command,
|
||||||
gpu_type: $gpu
|
gpu_type: $gpu
|
||||||
}')
|
}')
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||||
|
|
||||||
# run the benchmark
|
# run the benchmark
|
||||||
eval "$latency_command"
|
eval "$bench_command"
|
||||||
|
|
||||||
kill_gpu_processes
|
kill_gpu_processes
|
||||||
|
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
run_throughput_tests() {
|
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
|
||||||
# run throughput tests using `vllm bench throughput`
|
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
|
||||||
# $1: a json file specifying throughput test cases
|
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
|
||||||
|
|
||||||
local throughput_test_file
|
merge_serving_tests_stream() {
|
||||||
throughput_test_file=$1
|
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
|
||||||
|
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
|
||||||
# Iterate over throughput tests
|
local serving_test_file="$1"
|
||||||
jq -c '.[]' "$throughput_test_file" | while read -r params; do
|
# shellcheck disable=SC2016
|
||||||
# get the test name, and append the GPU type back to it.
|
local merged='
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
if [[ ! "$test_name" =~ ^throughput_ ]]; then
|
|
||||||
echo "In throughput-test.json, test_name must start with \"throughput_\"."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# get arguments
|
|
||||||
throughput_params=$(echo "$params" | jq -r '.parameters')
|
|
||||||
throughput_args=$(json2args "$throughput_params")
|
|
||||||
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
|
||||||
throughput_envs=$(json2envs "$throughput_environment_variables")
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
|
||||||
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
|
|
||||||
world_size=$(($tp*$pp))
|
|
||||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
|
||||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
throughput_command=" $throughput_envs vllm bench throughput \
|
|
||||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
|
||||||
$throughput_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
echo "Throughput command: $throughput_command"
|
|
||||||
# recoding benchmarking command ang GPU command
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg command "$throughput_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
'{
|
|
||||||
throughput_command: $command,
|
|
||||||
gpu_type: $gpu
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
|
||||||
|
|
||||||
# run the benchmark
|
|
||||||
eval "$throughput_command"
|
|
||||||
|
|
||||||
kill_gpu_processes
|
|
||||||
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `vllm bench serve` command
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
#
|
|
||||||
# Supported JSON formats:
|
|
||||||
# 1) Plain format: top-level array
|
|
||||||
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
|
||||||
#
|
|
||||||
# 2) Default parameters field + plain format tests
|
|
||||||
# {
|
|
||||||
# "defaults": { ... },
|
|
||||||
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
|
||||||
# }
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '
|
|
||||||
if type == "array" then
|
if type == "array" then
|
||||||
# Plain format: test cases array
|
# Plain format: test cases array
|
||||||
.[]
|
.[]
|
||||||
@@ -358,7 +285,50 @@ run_serving_tests() {
|
|||||||
else
|
else
|
||||||
error("Unsupported serving test file format: must be array or object with .tests")
|
error("Unsupported serving test file format: must be array or object with .tests")
|
||||||
end
|
end
|
||||||
' "$serving_test_file" | while read -r params; do
|
'
|
||||||
|
|
||||||
|
jq -c "$merged" "$serving_test_file" | \
|
||||||
|
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
|
||||||
|
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
|
||||||
|
select((($model|length)==0)
|
||||||
|
or ((.server_parameters.model // "") == $model)
|
||||||
|
or ((.client_parameters.model // "") == $model))
|
||||||
|
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
|
||||||
|
'
|
||||||
|
else
|
||||||
|
cat
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
run_serving_tests() {
|
||||||
|
# run serving tests using `vllm bench serve` command
|
||||||
|
# $1: a json file specifying serving test cases
|
||||||
|
#
|
||||||
|
# Supported JSON formats:
|
||||||
|
# 1) Plain format: top-level array
|
||||||
|
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||||
|
#
|
||||||
|
# 2) Default parameters field + plain format tests
|
||||||
|
# {
|
||||||
|
# "defaults": { ... },
|
||||||
|
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
|
||||||
|
# }
|
||||||
|
|
||||||
|
local serving_test_file
|
||||||
|
serving_test_file=$1
|
||||||
|
|
||||||
|
# In dry-run mode, if filters are provided but no tests match, fail fast.
|
||||||
|
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
|
||||||
|
local count
|
||||||
|
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
|
||||||
|
if [[ "$count" -eq 0 ]]; then
|
||||||
|
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Iterate over serving tests (merged + optional filtered stream)
|
||||||
|
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
|
||||||
# get the test name, and append the GPU type back to it.
|
# get the test name, and append the GPU type back to it.
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||||
@@ -427,7 +397,7 @@ run_serving_tests() {
|
|||||||
echo "Server command: $server_command"
|
echo "Server command: $server_command"
|
||||||
# support remote vllm server
|
# support remote vllm server
|
||||||
client_remote_args=""
|
client_remote_args=""
|
||||||
if [[ -z "${REMOTE_HOST}" ]]; then
|
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
bash -c "$server_command" &
|
bash -c "$server_command" &
|
||||||
server_pid=$!
|
server_pid=$!
|
||||||
# wait until the server is alive
|
# wait until the server is alive
|
||||||
@@ -438,6 +408,9 @@ run_serving_tests() {
|
|||||||
echo ""
|
echo ""
|
||||||
echo "vLLM failed to start within the timeout period."
|
echo "vLLM failed to start within the timeout period."
|
||||||
fi
|
fi
|
||||||
|
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
|
||||||
|
# dry-run: don't start server
|
||||||
|
echo "Dry Run."
|
||||||
else
|
else
|
||||||
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
|
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
|
||||||
if [[ ${REMOTE_PORT} ]]; then
|
if [[ ${REMOTE_PORT} ]]; then
|
||||||
@@ -447,34 +420,39 @@ run_serving_tests() {
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# save the compilation mode and optimization level on the serving results
|
||||||
|
# whenever they are set
|
||||||
|
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
|
||||||
|
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
|
||||||
|
|
||||||
# iterate over different QPS
|
# iterate over different QPS
|
||||||
for qps in $qps_list; do
|
for qps in $qps_list; do
|
||||||
# remove the surrounding single quote from qps
|
# remove the surrounding single quote from qps
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
if [[ "$qps" == *"inf"* ]]; then
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
qps="inf"
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# iterate over different max_concurrency
|
# iterate over different max_concurrency
|
||||||
for max_concurrency in $max_concurrency_list; do
|
for max_concurrency in $max_concurrency_list; do
|
||||||
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
|
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||||
echo " new test name $new_test_name"
|
echo " new test name $new_test_name"
|
||||||
# pass the tensor parallel size to the client so that it can be displayed
|
# pass the tensor parallel size, the compilation mode, and the optimization
|
||||||
# on the benchmark dashboard
|
# level to the client so that they can be used on the benchmark dashboard
|
||||||
client_command="vllm bench serve \
|
client_command="vllm bench serve \
|
||||||
--save-result \
|
--save-result \
|
||||||
--result-dir $RESULTS_FOLDER \
|
--result-dir $RESULTS_FOLDER \
|
||||||
--result-filename ${new_test_name}.json \
|
--result-filename ${new_test_name}.json \
|
||||||
--request-rate $qps \
|
--request-rate $qps \
|
||||||
--max-concurrency $max_concurrency \
|
--max-concurrency $max_concurrency \
|
||||||
--metadata "tensor_parallel_size=$tp" \
|
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
|
||||||
$client_args $client_remote_args "
|
$client_args $client_remote_args "
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
echo "Running test case $test_name with qps $qps"
|
||||||
echo "Client command: $client_command"
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
bash -c "$client_command"
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
|
bash -c "$client_command"
|
||||||
|
fi
|
||||||
|
|
||||||
# record the benchmarking commands
|
# record the benchmarking commands
|
||||||
jq_output=$(jq -n \
|
jq_output=$(jq -n \
|
||||||
@@ -492,12 +470,15 @@ run_serving_tests() {
|
|||||||
done
|
done
|
||||||
|
|
||||||
# clean up
|
# clean up
|
||||||
kill -9 $server_pid
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
kill_gpu_processes
|
kill -9 "$server_pid"
|
||||||
|
kill_gpu_processes
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
|
|
||||||
local ARCH
|
local ARCH
|
||||||
ARCH=''
|
ARCH=''
|
||||||
if [[ "$ON_CPU" == "1" ]]; then
|
if [[ "$ON_CPU" == "1" ]]; then
|
||||||
@@ -507,7 +488,13 @@ main() {
|
|||||||
check_gpus
|
check_gpus
|
||||||
ARCH="$arch_suffix"
|
ARCH="$arch_suffix"
|
||||||
fi
|
fi
|
||||||
check_hf_token
|
|
||||||
|
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
|
||||||
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
|
check_hf_token
|
||||||
|
else
|
||||||
|
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
|
||||||
|
fi
|
||||||
|
|
||||||
# dependencies
|
# dependencies
|
||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
@@ -528,12 +515,18 @@ main() {
|
|||||||
|
|
||||||
# dump vllm info via vllm collect-env
|
# dump vllm info via vllm collect-env
|
||||||
env_output=$(vllm collect-env)
|
env_output=$(vllm collect-env)
|
||||||
|
|
||||||
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
||||||
|
|
||||||
# benchmarking
|
# benchmarking
|
||||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
|
||||||
|
|
||||||
|
if [[ "${DRY_RUN:-0}" == "1" ]]; then
|
||||||
|
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
||||||
|
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
|
||||||
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
|
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
|
||||||
|
|
||||||
# postprocess benchmarking results
|
# postprocess benchmarking results
|
||||||
|
|||||||
@@ -51,5 +51,56 @@
|
|||||||
"max-model-len": 256,
|
"max-model-len": 256,
|
||||||
"async-scheduling": ""
|
"async-scheduling": ""
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_deepseek_r1",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"max-model-len": 512,
|
||||||
|
"max-num-seqs": 128,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"gpu-memory-utilization": 0.95,
|
||||||
|
"enable_expert_parallel": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_qwen3_8b",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 128,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"async-scheduling": ""
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,41 @@
|
|||||||
|
{
|
||||||
|
"defaults": {
|
||||||
|
"qps_list": [
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"max_concurrency_list": [
|
||||||
|
32,
|
||||||
|
64,
|
||||||
|
128
|
||||||
|
],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"model": "jinaai/jina-embeddings-v3",
|
||||||
|
"trust_remote_code": ""
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "jinaai/jina-embeddings-v3",
|
||||||
|
"backend": "openai-embeddings",
|
||||||
|
"endpoint": "/v1/embeddings",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tests": [
|
||||||
|
{
|
||||||
|
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,283 @@
|
|||||||
|
{
|
||||||
|
"defaults": {
|
||||||
|
"qps_list": [
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tests": [
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-1.7B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-1.7B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-4B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-4B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "zai-org/glm-4-9b-hf",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "zai-org/glm-4-9b-hf",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "google/gemma-7b",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "google/gemma-7b",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -148,136 +148,6 @@
|
|||||||
"random-input-len": 2048,
|
"random-input-len": 2048,
|
||||||
"random-output-len": 128
|
"random-output-len": 128
|
||||||
}
|
}
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 2
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-1.7B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-4B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "Qwen/Qwen3-8B",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "zai-org/glm-4-9b-hf",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"tensor_parallel_size": 1
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "google/gemma-7b",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,5 +78,84 @@
|
|||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_deepseek_r1",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"load_format": "dummy",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 200,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 128,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"enable_expert_parallel": "",
|
||||||
|
"max-num-batched-tokens": 4096
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen3_8b",
|
||||||
|
"qps_list": [1, 4, 10, "inf"],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"async-scheduling": ""
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen-3-8B",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -57,5 +57,67 @@
|
|||||||
"max-num-seqs": 512,
|
"max-num-seqs": 512,
|
||||||
"async-scheduling": ""
|
"async-scheduling": ""
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_deepseek_r1",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"num_prompts": 1000,
|
||||||
|
"backend": "vllm",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 384,
|
||||||
|
"async-scheduling": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||||
|
"tensor_parallel_size": 8,
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"num_prompts": 1000,
|
||||||
|
"backend": "vllm",
|
||||||
|
"max-model-len": 2048,
|
||||||
|
"max-num-seqs": 512,
|
||||||
|
"async-scheduling": "",
|
||||||
|
"enable_expert_parallel": ""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_qwen3_8b",
|
||||||
|
"environment_variables": {
|
||||||
|
"PT_HPU_LAZY_MODE": 1,
|
||||||
|
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||||
|
"VLLM_CONTIGUOUS_PA": 1,
|
||||||
|
"VLLM_DEFRAG": 1
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"num_prompts": 1000,
|
||||||
|
"max-num-seqs": 512,
|
||||||
|
"backend": "vllm",
|
||||||
|
"async-scheduling": ""
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -176,23 +176,6 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build release image for x86_64 ROCm"
|
|
||||||
key: block-rocm-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build release image - x86_64 - ROCm"
|
|
||||||
depends_on: block-rocm-release-image-build
|
|
||||||
id: build-release-image-rocm
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
# Build base image first
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
|
||||||
# Build vLLM ROCm image using the base
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
|
||||||
|
|
||||||
- group: "Publish release images"
|
- group: "Publish release images"
|
||||||
key: "publish-release-images"
|
key: "publish-release-images"
|
||||||
steps:
|
steps:
|
||||||
@@ -274,14 +257,14 @@ steps:
|
|||||||
- input-release-version
|
- input-release-version
|
||||||
- build-wheels
|
- build-wheels
|
||||||
|
|
||||||
- label: "Upload release wheels to PyPI and GitHub"
|
- label: "Upload release wheels to PyPI"
|
||||||
depends_on:
|
depends_on:
|
||||||
- block-upload-release-wheels
|
- block-upload-release-wheels
|
||||||
id: upload-release-wheels
|
id: upload-release-wheels
|
||||||
agents:
|
agents:
|
||||||
queue: small_cpu_queue_postmerge
|
queue: small_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "bash .buildkite/scripts/upload-release-wheels.sh"
|
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ROCm Release Pipeline (x86_64 only)
|
# ROCm Release Pipeline (x86_64 only)
|
||||||
@@ -476,7 +459,7 @@ steps:
|
|||||||
S3_BUCKET: "vllm-wheels"
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
|
||||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||||
- label: ":python: Build vLLM ROCm Wheel"
|
- label: ":python: Build vLLM ROCm Wheel - x86_64"
|
||||||
id: build-rocm-vllm-wheel
|
id: build-rocm-vllm-wheel
|
||||||
depends_on:
|
depends_on:
|
||||||
- step: build-rocm-base-wheels
|
- step: build-rocm-base-wheels
|
||||||
@@ -638,9 +621,93 @@ steps:
|
|||||||
depends_on:
|
depends_on:
|
||||||
- step: upload-rocm-wheels
|
- step: upload-rocm-wheels
|
||||||
allow_failure: true
|
allow_failure: true
|
||||||
|
- step: input-release-version
|
||||||
|
allow_failure: true
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||||
env:
|
env:
|
||||||
S3_BUCKET: "vllm-wheels"
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
|
||||||
|
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
|
||||||
|
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
|
||||||
|
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
|
||||||
|
- block: "Generate Root Index for ROCm Wheels for Release"
|
||||||
|
key: block-generate-root-index-rocm-wheels
|
||||||
|
depends_on: upload-rocm-wheels
|
||||||
|
|
||||||
|
- label: ":package: Generate Root Index for ROCm Wheels for Release"
|
||||||
|
depends_on: block-generate-root-index-rocm-wheels
|
||||||
|
id: generate-root-index-rocm-wheels
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
|
||||||
|
env:
|
||||||
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
VARIANT: "rocm700"
|
||||||
|
|
||||||
|
# ROCm Job 5: Build ROCm Release Docker Image
|
||||||
|
- label: ":docker: Build release image - x86_64 - ROCm"
|
||||||
|
id: build-rocm-release-image
|
||||||
|
depends_on:
|
||||||
|
- step: build-rocm-base-wheels
|
||||||
|
allow_failure: false
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Login to ECR
|
||||||
|
aws ecr-public get-login-password --region us-east-1 | \
|
||||||
|
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||||
|
|
||||||
|
# Download Docker image from S3 (set by build-rocm-base-wheels)
|
||||||
|
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||||
|
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||||
|
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
|
||||||
|
mkdir -p artifacts/rocm-docker-image
|
||||||
|
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||||
|
|
||||||
|
# Load base Docker image
|
||||||
|
echo "Loading base Docker image..."
|
||||||
|
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||||
|
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||||
|
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||||
|
|
||||||
|
# Tag and push the base image to ECR
|
||||||
|
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||||
|
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||||
|
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
|
||||||
|
|
||||||
|
# Get GPU architectures from meta-data
|
||||||
|
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||||
|
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||||
|
|
||||||
|
# Build vLLM ROCm release image using cached base
|
||||||
|
DOCKER_BUILDKIT=1 docker build \
|
||||||
|
--build-arg max_jobs=16 \
|
||||||
|
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||||
|
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||||
|
--build-arg USE_SCCACHE=1 \
|
||||||
|
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||||
|
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||||
|
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||||
|
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
|
||||||
|
--target vllm-openai \
|
||||||
|
--progress plain \
|
||||||
|
-f docker/Dockerfile.rocm .
|
||||||
|
|
||||||
|
# Push to ECR
|
||||||
|
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
||||||
|
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
S3_BUCKET: "vllm-wheels"
|
||||||
|
|||||||
@@ -11,28 +11,36 @@ fi
|
|||||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||||
To download the wheel (by commit):
|
To download the wheel (by commit):
|
||||||
\`\`\`
|
\`\`\`
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
|
||||||
|
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
(Optional) For CUDA 13.0:
|
||||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||||
|
|
||||||
|
(Optional) For CPU:
|
||||||
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
To download the wheel (by version):
|
|
||||||
\`\`\`
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
|
||||||
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
|
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
To download and upload the image:
|
To download and upload the image:
|
||||||
|
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
# Download images:
|
||||||
|
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
|
||||||
|
|
||||||
|
# Tag and push images:
|
||||||
|
|
||||||
|
## CUDA
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||||
@@ -40,22 +48,70 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
|||||||
docker push vllm/vllm-openai:latest-x86_64
|
docker push vllm/vllm-openai:latest-x86_64
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
||||||
|
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
||||||
|
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||||
|
docker push vllm/vllm-openai:latest-x86_64-cu130
|
||||||
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
docker push vllm/vllm-openai:latest-aarch64
|
docker push vllm/vllm-openai:latest-aarch64
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
|
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
||||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
|
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||||
docker push vllm/vllm-openai:latest-rocm
|
docker push vllm/vllm-openai:latest-aarch64-cu130
|
||||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||||
|
|
||||||
|
## ROCm
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
docker push vllm/vllm-openai-rocm:latest
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
docker push vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
|
||||||
|
## CPU
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
|
||||||
|
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
|
||||||
|
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||||
|
docker push vllm/vllm-openai-cpu:latest-x86_64
|
||||||
|
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
|
||||||
|
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
|
||||||
|
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||||
|
docker push vllm/vllm-openai-cpu:latest-arm64
|
||||||
|
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||||
|
|
||||||
|
# Create multi-arch manifest:
|
||||||
|
|
||||||
docker manifest rm vllm/vllm-openai:latest
|
docker manifest rm vllm/vllm-openai:latest
|
||||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||||
docker manifest push vllm/vllm-openai:latest
|
docker manifest push vllm/vllm-openai:latest
|
||||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
|
|
||||||
|
docker manifest rm vllm/vllm-openai:latest-cu130
|
||||||
|
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||||
|
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||||
|
docker manifest push vllm/vllm-openai:latest-cu130
|
||||||
|
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
||||||
|
|
||||||
|
docker manifest rm vllm/vllm-openai-cpu:latest || true
|
||||||
|
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
|
||||||
|
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||||
|
docker manifest push vllm/vllm-openai-cpu:latest
|
||||||
|
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
|
||||||
\`\`\`
|
\`\`\`
|
||||||
EOF
|
EOF
|
||||||
|
|||||||
@@ -3,25 +3,32 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
#
|
#
|
||||||
# Generate Buildkite annotation for ROCm wheel release
|
# Generate Buildkite annotation for ROCm wheel release
|
||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# Get build configuration from meta-data
|
# Get build configuration from meta-data
|
||||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
|
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
|
||||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||||
|
|
||||||
|
# TODO: Enable the nightly build for ROCm
|
||||||
|
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
||||||
|
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
|
||||||
|
if [ -z "${RELEASE_VERSION}" ]; then
|
||||||
|
RELEASE_VERSION="1.0.0.dev"
|
||||||
|
fi
|
||||||
|
|
||||||
# S3 URLs
|
# S3 URLs
|
||||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||||
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
|
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
|
||||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
|
|
||||||
|
|
||||||
|
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
|
||||||
|
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
|
||||||
|
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
|
||||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||||
## :rocm: ROCm Wheel Release
|
## ROCm Wheel and Docker Image Releases
|
||||||
|
|
||||||
### Build Configuration
|
### Build Configuration
|
||||||
| Setting | Value |
|
| Setting | Value |
|
||||||
|---------|-------|
|
|---------|-------|
|
||||||
@@ -34,41 +41,72 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
|
|||||||
### :package: Installation
|
### :package: Installation
|
||||||
|
|
||||||
**Install from this build (by commit):**
|
**Install from this build (by commit):**
|
||||||
\`\`\`bash
|
|
||||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
|
|
||||||
|
|
||||||
# Example:
|
\`\`\`bash
|
||||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
|
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||||
|
|
||||||
|
# Example for ROCm ${ROCM_VERSION}:
|
||||||
|
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
**Install from nightly (if published):**
|
**Install from nightly (if published):**
|
||||||
|
|
||||||
\`\`\`bash
|
\`\`\`bash
|
||||||
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
|
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
### :floppy_disk: Download Wheels Directly
|
### :floppy_disk: Download Wheels Directly
|
||||||
|
|
||||||
\`\`\`bash
|
\`\`\`bash
|
||||||
# List all ROCm wheels
|
# List all ROCm wheels
|
||||||
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
|
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
|
||||||
|
|
||||||
# Download specific wheels
|
# Download specific wheels
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
|
||||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
|
||||||
|
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
### :gear: Included Packages
|
### :gear: Included Packages
|
||||||
- **vllm**: vLLM with ROCm support
|
- **vllm**: vLLM with ROCm support
|
||||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||||
- **triton_rocm**: Triton built for ROCm
|
- **triton**: Triton
|
||||||
|
- **triton-kernels**: Triton kernels
|
||||||
- **torchvision**: TorchVision for ROCm PyTorch
|
- **torchvision**: TorchVision for ROCm PyTorch
|
||||||
|
- **torchaudio**: Torchaudio for ROCm PyTorch
|
||||||
- **amdsmi**: AMD SMI Python bindings
|
- **amdsmi**: AMD SMI Python bindings
|
||||||
|
- **amd_aiter**: Aiter for ROCm
|
||||||
|
- **flash-attn**: Flash Attention for ROCm
|
||||||
|
|
||||||
### :warning: Notes
|
### :warning: Notes
|
||||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||||
- Platform: Linux x86_64 only
|
- Platform: Linux x86_64 only
|
||||||
|
|
||||||
|
### :package: Docker Image Release
|
||||||
|
|
||||||
|
To download and upload the image:
|
||||||
|
|
||||||
|
\`\`\`
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
docker push vllm/vllm-openai-rocm:latest-base
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||||
|
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||||
|
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
docker push vllm/vllm-openai-rocm:latest
|
||||||
|
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
EOF
|
EOF
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ case "${1:-}" in
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||||
exit 1
|
exit 1
|
||||||
@@ -110,9 +110,9 @@ case "${1:-}" in
|
|||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Downloaded wheels:"
|
echo "Downloaded wheels:"
|
||||||
ls -lh artifacts/rocm-base-wheels/
|
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
|
||||||
|
|
||||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||||
echo ""
|
echo ""
|
||||||
echo "Total: $WHEEL_COUNT wheels"
|
echo "Total: $WHEEL_COUNT wheels"
|
||||||
echo "========================================"
|
echo "========================================"
|
||||||
|
|||||||
205
.buildkite/scripts/check-ray-compatibility.sh
Normal file
205
.buildkite/scripts/check-ray-compatibility.sh
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
#
|
||||||
|
# Check if Ray LLM can generate lock files that are compatible with this
|
||||||
|
# version of vllm. Downloads Ray's requirement files and runs a full
|
||||||
|
# dependency resolution with the installed vllm's constraints to see if
|
||||||
|
# a valid lock file can be produced.
|
||||||
|
#
|
||||||
|
# See: https://github.com/vllm-project/vllm/issues/33599
|
||||||
|
|
||||||
|
set -eo pipefail
|
||||||
|
|
||||||
|
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
|
||||||
|
|
||||||
|
WORK_DIR=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$WORK_DIR"' EXIT
|
||||||
|
|
||||||
|
# Fetch all Ray requirement files used in the LLM depset pipeline
|
||||||
|
echo ">>> Fetching Ray requirement files"
|
||||||
|
RAY_FILES=(
|
||||||
|
"requirements.txt"
|
||||||
|
"requirements/cloud-requirements.txt"
|
||||||
|
"requirements/base-test-requirements.txt"
|
||||||
|
"requirements/llm/llm-requirements.txt"
|
||||||
|
"requirements/llm/llm-test-requirements.txt"
|
||||||
|
)
|
||||||
|
for FILE in "${RAY_FILES[@]}"; do
|
||||||
|
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
|
||||||
|
echo " ${FILE}"
|
||||||
|
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
|
||||||
|
done
|
||||||
|
|
||||||
|
# Extract installed vllm deps
|
||||||
|
echo ">>> Extracting installed vllm dependency constraints"
|
||||||
|
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
|
||||||
|
"""Write out the installed vllm's dependencies as pip constraint lines.
|
||||||
|
|
||||||
|
Ray uses vllm[audio], so audio-extra deps are included with their extra
|
||||||
|
markers stripped. The resolver cannot evaluate extra markers for a
|
||||||
|
package that is not itself being resolved from an index, so we activate
|
||||||
|
them manually here.
|
||||||
|
"""
|
||||||
|
import importlib.metadata
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
out_path = sys.argv[1]
|
||||||
|
raw_reqs = importlib.metadata.requires("vllm") or []
|
||||||
|
|
||||||
|
# Ray uses vllm[audio] – activate that extra.
|
||||||
|
ACTIVE_EXTRAS = {"audio"}
|
||||||
|
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
for r in raw_reqs:
|
||||||
|
if ";" not in r:
|
||||||
|
# Unconditional dep — always include.
|
||||||
|
lines.append(r.strip())
|
||||||
|
continue
|
||||||
|
|
||||||
|
req_part, _, marker_part = r.partition(";")
|
||||||
|
marker_part = marker_part.strip()
|
||||||
|
|
||||||
|
extra_matches = EXTRA_RE.findall(marker_part)
|
||||||
|
if not extra_matches:
|
||||||
|
# Non-extra marker (python_version, etc.) — keep as-is.
|
||||||
|
lines.append(r.strip())
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not ACTIVE_EXTRAS.intersection(extra_matches):
|
||||||
|
continue # Skip inactive extras (tensorizer, bench, …).
|
||||||
|
|
||||||
|
# Strip the extra== conditions but keep any remaining markers
|
||||||
|
# (e.g. python_version).
|
||||||
|
cleaned = EXTRA_RE.sub("", marker_part)
|
||||||
|
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
|
||||||
|
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
|
||||||
|
|
||||||
|
if cleaned:
|
||||||
|
lines.append(f"{req_part.strip()} ; {cleaned}")
|
||||||
|
else:
|
||||||
|
lines.append(req_part.strip())
|
||||||
|
|
||||||
|
with open(out_path, "w") as f:
|
||||||
|
for line in lines:
|
||||||
|
f.write(line + "\n")
|
||||||
|
|
||||||
|
print(f"Wrote {len(lines)} constraints to {out_path}")
|
||||||
|
PYEOF
|
||||||
|
|
||||||
|
echo ">>> Installed vllm deps (first 20 lines):"
|
||||||
|
head -20 "${WORK_DIR}/vllm-constraints.txt"
|
||||||
|
|
||||||
|
# Remove Ray's vllm pin — the installed vllm's transitive deps
|
||||||
|
# (written above) replace it in the resolution. vllm itself cannot
|
||||||
|
# be resolved from PyPI for in-development versions, so we test
|
||||||
|
# whether Ray's requirements can coexist with vllm's dependency
|
||||||
|
# constraints instead.
|
||||||
|
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
|
||||||
|
|
||||||
|
# Install uv if needed
|
||||||
|
if ! command -v uv &>/dev/null; then
|
||||||
|
echo ">>> Installing uv"
|
||||||
|
pip install uv -q
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Resolve: given vllm's constraints, can Ray compile a lock file?
|
||||||
|
#
|
||||||
|
# vllm's dependency constraints are the fixed side — Ray is flexible and
|
||||||
|
# can regenerate its lock files. We pass vllm's constraints via -c so
|
||||||
|
# the resolver treats them as non-negotiable bounds, then check whether
|
||||||
|
# Ray's own requirements can still be satisfied within those bounds.
|
||||||
|
echo ""
|
||||||
|
echo "============================================================"
|
||||||
|
echo ">>> Resolving: Can Ray generate compatible lock files?"
|
||||||
|
echo "============================================================"
|
||||||
|
|
||||||
|
set +e
|
||||||
|
uv pip compile \
|
||||||
|
"${WORK_DIR}/requirements.txt" \
|
||||||
|
"${WORK_DIR}/cloud-requirements.txt" \
|
||||||
|
"${WORK_DIR}/base-test-requirements.txt" \
|
||||||
|
"${WORK_DIR}/llm-requirements.txt" \
|
||||||
|
"${WORK_DIR}/llm-test-requirements.txt" \
|
||||||
|
-c "${WORK_DIR}/vllm-constraints.txt" \
|
||||||
|
--python-version 3.12 \
|
||||||
|
--python-platform x86_64-manylinux_2_31 \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu129 \
|
||||||
|
--index-strategy unsafe-best-match \
|
||||||
|
--unsafe-package setuptools \
|
||||||
|
--unsafe-package ray \
|
||||||
|
--no-header \
|
||||||
|
-o "${WORK_DIR}/resolved.txt" \
|
||||||
|
2>&1
|
||||||
|
EXIT_CODE=$?
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=========================================="
|
||||||
|
if [ $EXIT_CODE -eq 0 ]; then
|
||||||
|
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
|
||||||
|
echo ""
|
||||||
|
echo "Key resolved versions:"
|
||||||
|
grep -E '^(protobuf|torch|numpy|transformers)==' \
|
||||||
|
"${WORK_DIR}/resolved.txt" | sort || true
|
||||||
|
echo "=========================================="
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
|
||||||
|
echo "This means a fundamental dependency conflict exists that Ray"
|
||||||
|
echo "cannot resolve by regenerating its lock files."
|
||||||
|
echo "See: https://github.com/vllm-project/vllm/issues/33599"
|
||||||
|
echo "=========================================="
|
||||||
|
|
||||||
|
# Buildkite annotation
|
||||||
|
if [ -f /usr/bin/buildkite-agent ]; then
|
||||||
|
buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
|
||||||
|
### :warning: Ray Dependency Compatibility Warning
|
||||||
|
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
|
||||||
|
Ray would not be able to regenerate its lock files to accommodate this vllm version.
|
||||||
|
|
||||||
|
Please check the **Ray Dependency Compatibility Check** step logs for details.
|
||||||
|
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
|
||||||
|
EOF
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Notify Slack if webhook is configured.
|
||||||
|
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
||||||
|
echo ">>> Sending Slack notification"
|
||||||
|
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
||||||
|
# shellcheck disable=SC2016
|
||||||
|
PAYLOAD=$(python3 -c '
|
||||||
|
import json, os, sys
|
||||||
|
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
|
||||||
|
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
|
||||||
|
url = os.getenv("BUILDKITE_BUILD_URL", "#")
|
||||||
|
data = {
|
||||||
|
"text": ":warning: Ray Dependency Compatibility Check Failed",
|
||||||
|
"blocks": [{
|
||||||
|
"type": "section",
|
||||||
|
"text": {
|
||||||
|
"type": "mrkdwn",
|
||||||
|
"text": (
|
||||||
|
"*:warning: Ray Dependency Compatibility Check Failed*\n"
|
||||||
|
f"PR #{pr} on branch `{branch}` introduces dependencies "
|
||||||
|
f"that cannot be resolved with Ray'\''s requirements.\n"
|
||||||
|
f"<{url}|View Build>"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
}],
|
||||||
|
}
|
||||||
|
print(json.dumps(data))
|
||||||
|
')
|
||||||
|
|
||||||
|
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
|
||||||
|
-H 'Content-type: application/json' \
|
||||||
|
-d "$PAYLOAD")
|
||||||
|
echo " Slack webhook response: $HTTP_CODE"
|
||||||
|
else
|
||||||
|
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
exit 1
|
||||||
@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
|||||||
|
|
||||||
# Store PR data in a temp file
|
# Store PR data in a temp file
|
||||||
PR_DATA=$(mktemp)
|
PR_DATA=$(mktemp)
|
||||||
trap "rm -f $PR_DATA" EXIT
|
trap 'rm -f "$PR_DATA"' EXIT
|
||||||
|
|
||||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||||
--limit 1000 \
|
--limit 1000 \
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
|||||||
|
|
||||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||||
"""
|
"""
|
||||||
Generate project list HTML content linking to each project & variant sub-directory.
|
Generate project list HTML content linking to each project & variant subdirectory.
|
||||||
"""
|
"""
|
||||||
href_tags = []
|
href_tags = []
|
||||||
for name in sorted(subdir_names):
|
for name in sorted(subdir_names):
|
||||||
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
|
|||||||
comment (str | None): Optional comment to include in the generated HTML files.
|
comment (str | None): Optional comment to include in the generated HTML files.
|
||||||
|
|
||||||
First, parse all wheel files to extract metadata.
|
First, parse all wheel files to extract metadata.
|
||||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
|
||||||
The index for the default variant (if any) is generated in the root index directory.
|
The index for the default variant (if any) is generated in the root index directory.
|
||||||
|
|
||||||
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
||||||
is purely a copy of the corresponding variant index, with only the links adjusted.
|
is purely a copy of the corresponding variant index, with only the links adjusted.
|
||||||
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
||||||
|
|
||||||
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
|
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
|
||||||
as the default variant index, but the links are adjusted accordingly.
|
as the default variant index, but the links are adjusted accordingly.
|
||||||
|
|
||||||
Index directory structure:
|
Index directory structure:
|
||||||
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
||||||
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
|
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
|
||||||
vllm/
|
vllm/
|
||||||
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
||||||
metadata.json # machine-readable metadata for all wheels in this package
|
metadata.json # machine-readable metadata for all wheels in this package
|
||||||
cpu/ # cpu variant sub-directory
|
cpu/ # cpu variant subdirectory
|
||||||
index.html
|
index.html
|
||||||
vllm/
|
vllm/
|
||||||
index.html
|
index.html
|
||||||
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
|
|||||||
vllm/
|
vllm/
|
||||||
index.html
|
index.html
|
||||||
metadata.json
|
metadata.json
|
||||||
cu130/ # cu130 variant sub-directory
|
cu130/ # cu130 variant subdirectory
|
||||||
index.html
|
index.html
|
||||||
vllm/
|
vllm/
|
||||||
index.html
|
index.html
|
||||||
|
|||||||
@@ -1,25 +1,57 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script runs test inside the corresponding ROCm docker container.
|
# This script runs tests inside the corresponding ROCm docker container.
|
||||||
|
# It handles both single-node and multi-node test configurations.
|
||||||
|
#
|
||||||
|
# Multi-node detection: Instead of matching on fragile group names, we detect
|
||||||
|
# multi-node jobs structurally by looking for the bracket command syntax
|
||||||
|
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
|
||||||
|
#
|
||||||
|
###############################################################################
|
||||||
|
# QUOTING / COMMAND PASSING
|
||||||
|
#
|
||||||
|
# Passing commands as positional arguments ($*) is fragile when the command
|
||||||
|
# string itself contains double quotes, e.g.:
|
||||||
|
#
|
||||||
|
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
|
||||||
|
#
|
||||||
|
# The outer shell resolves the nested quotes *before* this script runs, so
|
||||||
|
# the script receives mangled input it cannot fully recover.
|
||||||
|
#
|
||||||
|
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
|
||||||
|
#
|
||||||
|
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
|
||||||
|
# bash run-amd-test.sh
|
||||||
|
#
|
||||||
|
# Single-quoted assignment preserves all inner double quotes verbatim.
|
||||||
|
# The $* path is kept for backward compatibility but callers should migrate.
|
||||||
|
###############################################################################
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
# Export Python path
|
# Export Python path
|
||||||
export PYTHONPATH=".."
|
export PYTHONPATH=".."
|
||||||
|
|
||||||
# Print ROCm version
|
###############################################################################
|
||||||
echo "--- Confirming Clean Initial State"
|
# Helper Functions
|
||||||
while true; do
|
###############################################################################
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- ROCm info"
|
wait_for_clean_gpus() {
|
||||||
rocminfo
|
local timeout=${1:-300}
|
||||||
|
local start=$SECONDS
|
||||||
|
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
|
||||||
|
while true; do
|
||||||
|
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||||
|
echo "GPUs state is \"clean\""
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
if (( SECONDS - start >= timeout )); then
|
||||||
|
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
# cleanup older docker images
|
|
||||||
cleanup_docker() {
|
cleanup_docker() {
|
||||||
# Get Docker's root directory
|
# Get Docker's root directory
|
||||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
@@ -28,15 +60,12 @@ cleanup_docker() {
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "Docker root directory: $docker_root"
|
echo "Docker root directory: $docker_root"
|
||||||
# Check disk usage of the filesystem where Docker's root directory is located
|
|
||||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
# Define the threshold
|
|
||||||
threshold=70
|
threshold=70
|
||||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
|
||||||
docker image prune -f
|
docker image prune -f
|
||||||
# Remove unused volumes / force the system prune for old images as well.
|
|
||||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
echo "Docker images and volumes cleanup completed."
|
echo "Docker images and volumes cleanup completed."
|
||||||
else
|
else
|
||||||
@@ -44,201 +73,432 @@ cleanup_docker() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Call the cleanup docker function
|
cleanup_network() {
|
||||||
|
local max_nodes=${NUM_NODES:-2}
|
||||||
|
for node in $(seq 0 $((max_nodes - 1))); do
|
||||||
|
if docker ps -a -q -f name="node${node}" | grep -q .; then
|
||||||
|
docker stop "node${node}" || true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if docker network ls | grep -q docker-net; then
|
||||||
|
docker network rm docker-net || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
is_multi_node() {
|
||||||
|
local cmds="$1"
|
||||||
|
# Primary signal: NUM_NODES environment variable set by the pipeline
|
||||||
|
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
# Fallback: detect the bracket syntax structurally
|
||||||
|
# Pattern: [...] && [...] (per-node command arrays)
|
||||||
|
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
handle_pytest_exit() {
|
||||||
|
local exit_code=$1
|
||||||
|
if [ "$exit_code" -eq 5 ]; then
|
||||||
|
echo "Pytest exit code 5 (no tests collected) - treating as success."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
exit "$exit_code"
|
||||||
|
}
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Pytest marker/keyword re-quoting
|
||||||
|
#
|
||||||
|
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
|
||||||
|
# quotes around multi-word pytest -m/-k expressions get stripped:
|
||||||
|
# pytest -v -s -m 'not cpu_test' v1/core
|
||||||
|
# becomes:
|
||||||
|
# pytest -v -s -m not cpu_test v1/core
|
||||||
|
#
|
||||||
|
# pytest then interprets "cpu_test" as a file path, not part of the marker.
|
||||||
|
#
|
||||||
|
# This function detects unquoted expressions after -m/-k and re-quotes them
|
||||||
|
# by collecting tokens until a recognizable boundary is reached:
|
||||||
|
# - test path (contains '/')
|
||||||
|
# - test file (ends with '.py')
|
||||||
|
# - another pytest flag (--xxx or -x single-char flags)
|
||||||
|
# - command separator (&& || ; |)
|
||||||
|
# - environment variable assignment (FOO=bar)
|
||||||
|
#
|
||||||
|
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
|
||||||
|
# unquoted since they have no spaces and work fine.
|
||||||
|
#
|
||||||
|
# Already-quoted expressions (containing literal single quotes) are passed
|
||||||
|
# through untouched to avoid double-quoting values injected by
|
||||||
|
# apply_rocm_test_overrides.
|
||||||
|
#
|
||||||
|
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
|
||||||
|
# double-quotes stripped by the calling shell (see header comment).
|
||||||
|
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
|
||||||
|
###############################################################################
|
||||||
|
re_quote_pytest_markers() {
|
||||||
|
local input="$1"
|
||||||
|
local output=""
|
||||||
|
local collecting=false
|
||||||
|
local marker_buf=""
|
||||||
|
|
||||||
|
# Strip backslash-newline continuations, then flatten remaining newlines
|
||||||
|
local flat="${input//$'\\\n'/ }"
|
||||||
|
flat="${flat//$'\n'/ }"
|
||||||
|
|
||||||
|
# Disable globbing to prevent *.py etc. from expanding during read -ra
|
||||||
|
local restore_glob
|
||||||
|
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
|
||||||
|
set -o noglob
|
||||||
|
local -a words
|
||||||
|
read -ra words <<< "$flat"
|
||||||
|
eval "$restore_glob"
|
||||||
|
|
||||||
|
for word in "${words[@]}"; do
|
||||||
|
if $collecting; then
|
||||||
|
# If the token we're about to collect already contains a literal
|
||||||
|
# single quote, the expression was already quoted upstream.
|
||||||
|
# Flush and stop collecting.
|
||||||
|
if [[ "$word" == *"'"* ]]; then
|
||||||
|
if [[ -n "$marker_buf" ]]; then
|
||||||
|
# Should not normally happen (partial buf + quote), flush raw
|
||||||
|
output+="${marker_buf} "
|
||||||
|
marker_buf=""
|
||||||
|
fi
|
||||||
|
output+="${word} "
|
||||||
|
collecting=false
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
local is_boundary=false
|
||||||
|
case "$word" in
|
||||||
|
# Line-continuation artifact
|
||||||
|
"\\")
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Command separators
|
||||||
|
"&&"|"||"|";"|"|")
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Long flags (--ignore, --shard-id, etc.)
|
||||||
|
--*)
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
|
||||||
|
# like "not" which don't start with "-". Also skip -k/-m which
|
||||||
|
# would start a new marker (handled below).
|
||||||
|
-[a-zA-Z])
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Test path (contains /)
|
||||||
|
*/*)
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Test file (ends with .py, possibly with ::method)
|
||||||
|
*.py|*.py::*)
|
||||||
|
is_boundary=true ;;
|
||||||
|
# Environment variable assignment preceding a command (FOO=bar)
|
||||||
|
*=*)
|
||||||
|
# Only treat as boundary if it looks like VAR=value, not
|
||||||
|
# pytest filter expressions like num_gpus=2 inside markers
|
||||||
|
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
|
||||||
|
is_boundary=true
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if $is_boundary; then
|
||||||
|
# Flush the collected marker expression
|
||||||
|
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||||
|
output+="'${marker_buf}' "
|
||||||
|
else
|
||||||
|
output+="${marker_buf} "
|
||||||
|
fi
|
||||||
|
collecting=false
|
||||||
|
marker_buf=""
|
||||||
|
# Check if this boundary word itself starts a new -m/-k
|
||||||
|
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||||
|
output+="${word} "
|
||||||
|
collecting=true
|
||||||
|
# Drop stray backslash tokens silently
|
||||||
|
elif [[ "$word" == "\\" ]]; then
|
||||||
|
:
|
||||||
|
else
|
||||||
|
output+="${word} "
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# Accumulate into marker buffer
|
||||||
|
if [[ -n "$marker_buf" ]]; then
|
||||||
|
marker_buf+=" ${word}"
|
||||||
|
else
|
||||||
|
marker_buf="${word}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||||
|
output+="${word} "
|
||||||
|
collecting=true
|
||||||
|
marker_buf=""
|
||||||
|
else
|
||||||
|
output+="${word} "
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Flush any trailing marker expression (marker at end of command)
|
||||||
|
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||||
|
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||||
|
output+="'${marker_buf}'"
|
||||||
|
else
|
||||||
|
output+="${marker_buf}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "${output% }"
|
||||||
|
}
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# ROCm-specific pytest command rewrites
|
||||||
|
#
|
||||||
|
# These apply ignore flags and environment overrides for tests that are not
|
||||||
|
# yet supported or behave differently on ROCm hardware. Kept as a single
|
||||||
|
# function so new exclusions are easy to add in one place.
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
apply_rocm_test_overrides() {
|
||||||
|
local cmds="$1"
|
||||||
|
|
||||||
|
# --- Model registry filter ---
|
||||||
|
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||||
|
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- LoRA: disable custom paged attention ---
|
||||||
|
if [[ $cmds == *"pytest -v -s lora"* ]]; then
|
||||||
|
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- Kernel ignores ---
|
||||||
|
if [[ $cmds == *" kernels/core"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||||
|
--ignore=kernels/core/test_permute_cols.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/attention"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/attention/test_attention_selector.py \
|
||||||
|
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||||
|
--ignore=kernels/attention/test_flash_attn.py \
|
||||||
|
--ignore=kernels/attention/test_flashinfer.py \
|
||||||
|
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||||
|
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||||
|
--ignore=kernels/attention/test_mha_attn.py \
|
||||||
|
--ignore=kernels/attention/test_lightning_attn.py \
|
||||||
|
--ignore=kernels/attention/test_attention.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/quantization"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/quantization/test_int8_quant.py \
|
||||||
|
--ignore=kernels/quantization/test_machete_mm.py \
|
||||||
|
--ignore=kernels/quantization/test_block_fp8.py \
|
||||||
|
--ignore=kernels/quantization/test_block_int8.py \
|
||||||
|
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||||
|
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||||
|
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/mamba"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||||
|
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||||
|
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" kernels/moe"* ]]; then
|
||||||
|
cmds="${cmds} \
|
||||||
|
--ignore=kernels/moe/test_moe.py \
|
||||||
|
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||||
|
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- Entrypoint ignores ---
|
||||||
|
if [[ $cmds == *" entrypoints/openai "* ]]; then
|
||||||
|
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
|
||||||
|
--ignore=entrypoints/openai/test_audio.py \
|
||||||
|
--ignore=entrypoints/openai/test_shutdown.py \
|
||||||
|
--ignore=entrypoints/openai/test_completion.py \
|
||||||
|
--ignore=entrypoints/openai/test_models.py \
|
||||||
|
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||||
|
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||||
|
--ignore=entrypoints/openai/test_root_path.py \
|
||||||
|
--ignore=entrypoints/openai/test_tokenization.py \
|
||||||
|
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $cmds == *" entrypoints/llm "* ]]; then
|
||||||
|
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
|
||||||
|
--ignore=entrypoints/llm/test_chat.py \
|
||||||
|
--ignore=entrypoints/llm/test_accuracy.py \
|
||||||
|
--ignore=entrypoints/llm/test_init.py \
|
||||||
|
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Clean up escaped newlines from --ignore appends
|
||||||
|
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
|
||||||
|
|
||||||
|
echo "$cmds"
|
||||||
|
}
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Main
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
# --- GPU initialization ---
|
||||||
|
echo "--- Confirming Clean Initial State"
|
||||||
|
wait_for_clean_gpus
|
||||||
|
|
||||||
|
echo "--- ROCm info"
|
||||||
|
rocminfo
|
||||||
|
|
||||||
|
# --- Docker housekeeping ---
|
||||||
cleanup_docker
|
cleanup_docker
|
||||||
|
|
||||||
echo "--- Resetting GPUs"
|
echo "--- Resetting GPUs"
|
||||||
|
|
||||||
echo "reset" > /opt/amdgpu/etc/gpu_state
|
echo "reset" > /opt/amdgpu/etc/gpu_state
|
||||||
|
wait_for_clean_gpus
|
||||||
|
|
||||||
while true; do
|
# --- Pull test image ---
|
||||||
sleep 3
|
|
||||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
|
||||||
echo "GPUs state is \"clean\""
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "--- Pulling container"
|
echo "--- Pulling container"
|
||||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
docker pull "${image_name}"
|
docker pull "${image_name}"
|
||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# --- Prepare commands ---
|
||||||
echo "--- Running container"
|
echo "--- Running container"
|
||||||
|
|
||||||
HF_CACHE="$(realpath ~)/huggingface"
|
HF_CACHE="$(realpath ~)/huggingface"
|
||||||
mkdir -p "${HF_CACHE}"
|
mkdir -p "${HF_CACHE}"
|
||||||
HF_MOUNT="/root/.cache/huggingface"
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
|
||||||
commands=$@
|
# ---- Command source selection ----
|
||||||
echo "Commands:$commands"
|
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
|
||||||
|
# Fall back to $* for backward compatibility, but warn that inner
|
||||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
|
# double-quotes will have been stripped by the calling shell.
|
||||||
|
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
|
||||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
commands="${VLLM_TEST_COMMANDS}"
|
||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
|
||||||
|
else
|
||||||
|
commands="$*"
|
||||||
|
if [[ -z "$commands" ]]; then
|
||||||
|
echo "Error: No test commands provided." >&2
|
||||||
|
echo "Usage:" >&2
|
||||||
|
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
|
||||||
|
echo " Legacy: bash $0 \"commands here\"" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Commands sourced from positional args (legacy mode)"
|
||||||
|
echo "WARNING: Inner double-quotes in the command string may have been"
|
||||||
|
echo " stripped by the calling shell. If you see syntax errors, switch to:"
|
||||||
|
echo " export VLLM_TEST_COMMANDS='your commands here'"
|
||||||
|
echo " bash $0"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
|
echo "Raw commands: $commands"
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
# Fix quoting before ROCm overrides (so overrides see correct structure)
|
||||||
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
commands=$(re_quote_pytest_markers "$commands")
|
||||||
fi
|
echo "After re-quoting: $commands"
|
||||||
|
|
||||||
#ignore certain kernels tests
|
commands=$(apply_rocm_test_overrides "$commands")
|
||||||
if [[ $commands == *" kernels/core"* ]]; then
|
echo "Final commands: $commands"
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
|
||||||
--ignore=kernels/core/test_permute_cols.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/attention"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/attention/test_attention_selector.py \
|
|
||||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
|
||||||
--ignore=kernels/attention/test_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_flashinfer.py \
|
|
||||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
|
||||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
|
||||||
--ignore=kernels/attention/test_mha_attn.py \
|
|
||||||
--ignore=kernels/attention/test_lightning_attn.py \
|
|
||||||
--ignore=kernels/attention/test_attention.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/quantization/test_int8_quant.py \
|
|
||||||
--ignore=kernels/quantization/test_machete_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_block_fp8.py \
|
|
||||||
--ignore=kernels/quantization/test_block_int8.py \
|
|
||||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
|
||||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
|
||||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
|
||||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
|
||||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $commands == *" kernels/moe"* ]]; then
|
|
||||||
commands="${commands} \
|
|
||||||
--ignore=kernels/moe/test_moe.py \
|
|
||||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
|
||||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/openai tests
|
|
||||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
|
||||||
--ignore=entrypoints/openai/test_audio.py \
|
|
||||||
--ignore=entrypoints/openai/test_shutdown.py \
|
|
||||||
--ignore=entrypoints/openai/test_completion.py \
|
|
||||||
--ignore=entrypoints/openai/test_models.py \
|
|
||||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
|
||||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
|
||||||
--ignore=entrypoints/openai/test_root_path.py \
|
|
||||||
--ignore=entrypoints/openai/test_tokenization.py \
|
|
||||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
#ignore certain Entrypoints/llm tests
|
|
||||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
|
||||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
|
||||||
--ignore=entrypoints/llm/test_chat.py \
|
|
||||||
--ignore=entrypoints/llm/test_accuracy.py \
|
|
||||||
--ignore=entrypoints/llm/test_init.py \
|
|
||||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
|
||||||
fi
|
|
||||||
|
|
||||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
|
||||||
# --ignore=entrypoints/openai/test_embedding.py \
|
|
||||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
|
||||||
# --ignore=entrypoints/openai/test_accuracy.py \
|
|
||||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
|
||||||
|
|
||||||
|
|
||||||
PARALLEL_JOB_COUNT=8
|
|
||||||
MYPYTHONPATH=".."
|
MYPYTHONPATH=".."
|
||||||
|
|
||||||
# Test that we're launching on the machine that has
|
# Verify GPU access
|
||||||
# proper access to GPUs
|
|
||||||
render_gid=$(getent group render | cut -d: -f3)
|
render_gid=$(getent group render | cut -d: -f3)
|
||||||
if [[ -z "$render_gid" ]]; then
|
if [[ -z "$render_gid" ]]; then
|
||||||
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
# --- RDMA device passthrough (conditional) ---
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
# If the host has RDMA devices, pass them through so tests like
|
||||||
# assign job count as the number of shards used
|
# test_moriio_connector can access ibverbs. On hosts without RDMA
|
||||||
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
|
# hardware the tests will gracefully skip via _rdma_available().
|
||||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
RDMA_FLAGS=""
|
||||||
# assign shard-id for each shard
|
if [ -d /dev/infiniband ]; then
|
||||||
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
|
echo "RDMA devices detected on host, enabling passthrough"
|
||||||
echo "Shard ${GPU} commands:$commands_gpu"
|
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
else
|
||||||
docker run \
|
echo "No RDMA devices found on host, RDMA tests will be skipped"
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
fi
|
||||||
--network=host \
|
|
||||||
--shm-size=16gb \
|
# --- Route: multi-node vs single-node ---
|
||||||
--group-add "$render_gid" \
|
if is_multi_node "$commands"; then
|
||||||
--rm \
|
echo "--- Multi-node job detected"
|
||||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
|
||||||
-e HF_TOKEN \
|
|
||||||
-e AWS_ACCESS_KEY_ID \
|
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
# BASH_REMATCH[1] = prefix (everything before first bracket)
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
# BASH_REMATCH[2] = comma-separated node0 commands
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
# BASH_REMATCH[3] = comma-separated node1 commands
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
|
||||||
--name "${container_name}_${GPU}" \
|
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
|
||||||
"${image_name}" \
|
echo "PREFIX: ${prefix}"
|
||||||
/bin/bash -c "${commands_gpu}" \
|
|
||||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
export composite_command="(command rocm-smi || true)"
|
||||||
PIDS+=($!)
|
saved_IFS=$IFS
|
||||||
done
|
IFS=','
|
||||||
#wait for all processes to finish and collect exit codes
|
read -ra node0 <<< "${BASH_REMATCH[2]}"
|
||||||
for pid in "${PIDS[@]}"; do
|
read -ra node1 <<< "${BASH_REMATCH[3]}"
|
||||||
wait "${pid}"
|
IFS=$saved_IFS
|
||||||
STATUS+=($?)
|
|
||||||
done
|
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
|
||||||
at_least_one_shard_with_tests=0
|
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
|
||||||
for st in "${STATUS[@]}"; do
|
|
||||||
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
|
|
||||||
echo "One of the processes failed with $st"
|
|
||||||
exit "${st}"
|
|
||||||
elif [[ ${st} -eq 5 ]]; then
|
|
||||||
echo "Shard exited with status 5 (no tests collected) - treating as success"
|
|
||||||
else # This means st is 0
|
|
||||||
at_least_one_shard_with_tests=1
|
|
||||||
fi
|
fi
|
||||||
done
|
|
||||||
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
|
for i in "${!node0[@]}"; do
|
||||||
echo "All shards reported no tests collected. Failing the build."
|
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
|
||||||
exit 1
|
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
|
||||||
|
|
||||||
|
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
|
||||||
|
echo "COMMANDS: ${step_cmd}"
|
||||||
|
composite_command="${composite_command} && ${step_cmd}"
|
||||||
|
done
|
||||||
|
|
||||||
|
/bin/bash -c "${composite_command}"
|
||||||
|
exit_code=$?
|
||||||
|
cleanup_network
|
||||||
|
handle_pytest_exit "$exit_code"
|
||||||
|
else
|
||||||
|
echo "Multi-node job detected but failed to parse bracket command syntax."
|
||||||
|
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
|
||||||
|
echo "Got: $commands"
|
||||||
|
cleanup_network
|
||||||
|
exit 111
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
echo "--- Single-node job"
|
||||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||||
docker run \
|
docker run \
|
||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||||
--network=host \
|
$RDMA_FLAGS \
|
||||||
--shm-size=16gb \
|
--network=host \
|
||||||
--group-add "$render_gid" \
|
--shm-size=16gb \
|
||||||
--rm \
|
--group-add "$render_gid" \
|
||||||
-e HF_TOKEN \
|
--rm \
|
||||||
-e AWS_ACCESS_KEY_ID \
|
-e HF_TOKEN \
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
-e AWS_ACCESS_KEY_ID \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-e AWS_SECRET_ACCESS_KEY \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
--name "${container_name}" \
|
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||||
"${image_name}" \
|
--name "${container_name}" \
|
||||||
/bin/bash -c "${commands}"
|
"${image_name}" \
|
||||||
|
/bin/bash -c "${commands}"
|
||||||
|
|
||||||
|
exit_code=$?
|
||||||
|
handle_pytest_exit "$exit_code"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -euox pipefail
|
||||||
|
export VLLM_CPU_CI_ENV=0
|
||||||
|
|
||||||
|
echo "--- PP+TP"
|
||||||
|
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||||
|
server_pid=$!
|
||||||
|
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||||
|
vllm bench serve \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--result-dir ./test_results \
|
||||||
|
--result-filename tp_pp.json \
|
||||||
|
--save-result \
|
||||||
|
--endpoint /v1/completions
|
||||||
|
kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||||
|
failed_req=$(jq '.failed' ./test_results/tp_pp.json)
|
||||||
|
if [ "$failed_req" -ne 0 ]; then
|
||||||
|
echo "Some requests were failed!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "--- DP+TP"
|
||||||
|
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||||
|
server_pid=$!
|
||||||
|
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||||
|
vllm bench serve \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--result-dir ./test_results \
|
||||||
|
--result-filename dp_pp.json \
|
||||||
|
--save-result \
|
||||||
|
--endpoint /v1/completions
|
||||||
|
kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||||
|
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
|
||||||
|
if [ "$failed_req" -ne 0 ]; then
|
||||||
|
echo "Some requests were failed!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
|||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
export TORCH_COMPILE_DISABLE=1
|
export TORCH_COMPILE_DISABLE=1
|
||||||
set -xve
|
set -xve
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
@@ -43,7 +43,7 @@ function cpu_tests() {
|
|||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
||||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
||||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
|
||||||
}
|
}
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
# All of CPU tests are expected to be finished less than 40 mins.
|
||||||
|
|||||||
@@ -2,119 +2,19 @@
|
|||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -ex
|
set -euox pipefail
|
||||||
|
|
||||||
# allow to bind to different cores
|
# allow to bind to different cores
|
||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||||
# used for TP/PP E2E test
|
|
||||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
NUMA_NODE=${NUMA_NODE:-1}
|
||||||
|
IMAGE_NAME="cpu-test-$NUMA_NODE"
|
||||||
|
TIMEOUT_VAL=$1
|
||||||
|
TEST_COMMAND=$2
|
||||||
|
|
||||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
# building the docker image
|
||||||
|
echo "--- :docker: Building Docker image"
|
||||||
# Setup cleanup
|
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
remove_docker_container() {
|
|
||||||
set -e;
|
|
||||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
|
||||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
|
||||||
|
|
||||||
function cpu_tests() {
|
|
||||||
set -e
|
|
||||||
export NUMA_NODE=$2
|
|
||||||
|
|
||||||
# list packages
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pip list"
|
|
||||||
|
|
||||||
# offline inference
|
|
||||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
|
||||||
set -e
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run kernel tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
|
||||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
|
||||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
|
||||||
|
|
||||||
# Run basic model test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
# Note: disable until supports V1
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
|
||||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
|
||||||
pytest -x -v -s tests/models/multimodal/generation \
|
|
||||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
|
||||||
-m cpu_model"
|
|
||||||
|
|
||||||
# Run compressed-tensor test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
|
||||||
|
|
||||||
# Run AWQ/GPTQ test
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/quantization/test_cpu_wna16.py"
|
|
||||||
|
|
||||||
# Run multi-lora tests
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -x -s -v \
|
|
||||||
tests/lora/test_qwenvl.py"
|
|
||||||
|
|
||||||
# online serving: tp+pp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
|
|
||||||
# online serving: tp+dp
|
|
||||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
|
||||||
set -e
|
|
||||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
|
||||||
server_pid=$!
|
|
||||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name random \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--endpoint /v1/completions
|
|
||||||
kill -s SIGTERM $server_pid &'
|
|
||||||
}
|
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
|
||||||
export -f cpu_tests
|
|
||||||
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
|
||||||
|
|||||||
@@ -1,21 +1,49 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script builds the HPU docker image and runs the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
#
|
||||||
|
# vllm-gaudi compatibility pinning:
|
||||||
|
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
|
||||||
|
# When upstream vllm changes its API, the plugin may break before it has been updated.
|
||||||
|
# To handle this, the vllm-gaudi repository maintains a file:
|
||||||
|
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
|
||||||
|
# The first line of that file controls what version of vllm is used inside the Docker image:
|
||||||
|
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
|
||||||
|
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
|
||||||
|
# the test to a known-compatible baseline.
|
||||||
|
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
|
||||||
set -exuo pipefail
|
set -exuo pipefail
|
||||||
|
|
||||||
|
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
|
||||||
|
VLLM_COMMUNITY_COMMIT=$(curl -s \
|
||||||
|
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
|
||||||
|
| head -1 | tr -d '\n')
|
||||||
|
|
||||||
|
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
|
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
|
||||||
|
cat <<EOF | docker build -t "${image_name}" -f - .
|
||||||
FROM gaudi-base-image:latest
|
FROM gaudi-base-image:latest
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
|
||||||
|
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
|
||||||
|
# the current checkout (the Buildkite CI commit) is used unchanged.
|
||||||
|
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
|
||||||
|
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
|
||||||
|
fi
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
ENV no_proxy=localhost,127.0.0.1
|
ENV no_proxy=localhost,127.0.0.1
|
||||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=empty pip install .
|
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
|
||||||
|
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
|
||||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
@@ -36,15 +64,20 @@ EOF
|
|||||||
# functions, while other platforms only need one remove_docker_container
|
# functions, while other platforms only need one remove_docker_container
|
||||||
# function.
|
# function.
|
||||||
EXITCODE=1
|
EXITCODE=1
|
||||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
remove_docker_containers() { docker rm -f "${container_name}" || true; }
|
||||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||||
remove_docker_containers
|
remove_docker_containers
|
||||||
|
|
||||||
echo "Running HPU plugin v1 test"
|
echo "Running HPU plugin v1 test"
|
||||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
||||||
-e HABANA_VISIBLE_DEVICES=all \
|
-e HABANA_VISIBLE_DEVICES=all \
|
||||||
hpu-plugin-v1-test-env \
|
-e VLLM_SKIP_WARMUP=true \
|
||||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
|
||||||
|
-e PT_HPU_LAZY_MODE=1 \
|
||||||
|
"${image_name}" \
|
||||||
|
/bin/bash -c '
|
||||||
|
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||||
|
'
|
||||||
|
|
||||||
EXITCODE=$?
|
EXITCODE=$?
|
||||||
if [ $EXITCODE -eq 0 ]; then
|
if [ $EXITCODE -eq 0 ]; then
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ get_config() {
|
|||||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
# shellcheck source=/dev/null
|
||||||
source "${TEST_RUN_CONFIG_FILE}"
|
source "${TEST_RUN_CONFIG_FILE}"
|
||||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
||||||
return 0
|
return 0
|
||||||
@@ -48,9 +49,8 @@ get_config() {
|
|||||||
|
|
||||||
# get test running configuration.
|
# get test running configuration.
|
||||||
fetch_vllm_test_cfg
|
fetch_vllm_test_cfg
|
||||||
get_config
|
|
||||||
# Check if the function call was successful. If not, exit the script.
|
# Check if the function call was successful. If not, exit the script.
|
||||||
if [ $? -ne 0 ]; then
|
if ! get_config; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
|||||||
echo "agent_idx: ${agent_idx}"
|
echo "agent_idx: ${agent_idx}"
|
||||||
builder_name="cachebuilder${agent_idx}"
|
builder_name="cachebuilder${agent_idx}"
|
||||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
||||||
mkdir -p ${builder_cache_dir}
|
mkdir -p "${builder_cache_dir}"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
||||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
|
||||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
|
||||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
|
||||||
--progress=plain --load -t ${image_name} -f - .
|
--progress=plain --load -t "${image_name}" -f - .
|
||||||
FROM ${BASE_IMAGE_NAME}
|
FROM ${BASE_IMAGE_NAME}
|
||||||
|
|
||||||
# Define environments
|
# Define environments
|
||||||
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
|
||||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
||||||
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
|
|||||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
||||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
||||||
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
|
||||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
# returns one argument per line: --device, /dev/davinciX, ...
|
||||||
parse_and_gen_devices() {
|
parse_and_gen_devices() {
|
||||||
local input="$1"
|
local input="$1"
|
||||||
local index cards_num
|
local index cards_num
|
||||||
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
|
|||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
local devices=""
|
|
||||||
local i=0
|
local i=0
|
||||||
while (( i < cards_num )); do
|
while (( i < cards_num )); do
|
||||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
local dev_idx=$(((index - 1)*cards_num + i ))
|
||||||
devices="$devices --device /dev/davinci${dev_idx}"
|
printf '%s\n' "--device"
|
||||||
|
printf '%s\n' "/dev/davinci${dev_idx}"
|
||||||
((i++))
|
((i++))
|
||||||
done
|
done
|
||||||
|
|
||||||
# trim leading space
|
|
||||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
|
||||||
# Output devices: assigned to the caller variable
|
|
||||||
printf '%s' "$devices"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
||||||
|
|
||||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
||||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
||||||
# the hardware plugin vllm-ascend.
|
# the hardware plugin vllm-ascend.
|
||||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
model_cache_dir=/mnt/modelscope${agent_idx}
|
||||||
mkdir -p ${model_cache_dir}
|
mkdir -p "${model_cache_dir}"
|
||||||
docker run \
|
docker run \
|
||||||
${devices} \
|
"${device_args[@]}" \
|
||||||
--device /dev/davinci_manager \
|
--device /dev/davinci_manager \
|
||||||
--device /dev/devmm_svm \
|
--device /dev/devmm_svm \
|
||||||
--device /dev/hisi_hdc \
|
--device /dev/hisi_hdc \
|
||||||
@@ -182,7 +177,7 @@ docker run \
|
|||||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
-v "${model_cache_dir}":/root/.cache/modelscope \
|
||||||
--entrypoint="" \
|
--entrypoint="" \
|
||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
|
|||||||
echo "--- Installing Python dependencies ---"
|
echo "--- Installing Python dependencies ---"
|
||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
|
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
|||||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
@@ -38,15 +38,18 @@ docker run \
|
|||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||||
|
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core
|
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
pytest -v -s v1/structured_output
|
pytest -v -s v1/structured_output
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||||
pytest -v -s v1/test_serial_utils.py
|
pytest -v -s v1/test_serial_utils.py
|
||||||
'
|
'
|
||||||
|
|||||||
@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
|
|||||||
|
|
||||||
# pull original arch-dependent images from AWS ECR Public
|
# pull original arch-dependent images from AWS ECR Public
|
||||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
|
||||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
|
||||||
# tag arch-dependent images
|
# tag arch-dependent images
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||||
# push arch-dependent images to DockerHub
|
# push arch-dependent images to DockerHub
|
||||||
docker push vllm/vllm-openai:$TAG_NAME-x86_64
|
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
|
||||||
docker push vllm/vllm-openai:$TAG_NAME-aarch64
|
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
|
||||||
# push arch-independent manifest to DockerHub
|
# push arch-independent manifest to DockerHub
|
||||||
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||||
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
|
||||||
docker manifest push vllm/vllm-openai:$TAG_NAME
|
docker manifest push vllm/vllm-openai:"$TAG_NAME"
|
||||||
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
|
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
|
||||||
|
|||||||
@@ -1,64 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
# Setup script for Prime-RL integration tests
|
|
||||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
|
||||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
|
||||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
|
||||||
|
|
||||||
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
|
|
||||||
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Setting up Prime-RL integration test environment..."
|
|
||||||
|
|
||||||
# Clean up any existing Prime-RL directory
|
|
||||||
if [ -d "${PRIME_RL_DIR}" ]; then
|
|
||||||
echo "Removing existing Prime-RL directory..."
|
|
||||||
rm -rf "${PRIME_RL_DIR}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install UV if not available
|
|
||||||
if ! command -v uv &> /dev/null; then
|
|
||||||
echo "Installing UV package manager..."
|
|
||||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
||||||
source $HOME/.local/bin/env
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clone Prime-RL repository at specific branch for reproducible tests
|
|
||||||
PRIME_RL_BRANCH="integ-vllm-main"
|
|
||||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
|
|
||||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
|
|
||||||
cd "${PRIME_RL_DIR}"
|
|
||||||
|
|
||||||
echo "Setting up UV project environment..."
|
|
||||||
export UV_PROJECT_ENVIRONMENT=/usr/local
|
|
||||||
ln -s /usr/bin/python3 /usr/local/bin/python
|
|
||||||
|
|
||||||
# Remove vllm pin from pyproject.toml
|
|
||||||
echo "Removing vllm pin from pyproject.toml..."
|
|
||||||
sed -i '/vllm==/d' pyproject.toml
|
|
||||||
|
|
||||||
# Sync Prime-RL dependencies
|
|
||||||
echo "Installing Prime-RL dependencies..."
|
|
||||||
uv sync --inexact && uv sync --inexact --all-extras
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
echo "Verifying installations..."
|
|
||||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
|
||||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
|
|
||||||
|
|
||||||
echo "Prime-RL integration test environment setup complete!"
|
|
||||||
|
|
||||||
echo "Running Prime-RL integration tests..."
|
|
||||||
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
|
|
||||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
|
|
||||||
|
|
||||||
echo "Prime-RL integration tests completed!"
|
|
||||||
@@ -43,7 +43,6 @@ trap cleanup EXIT
|
|||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
for BACK in "${BACKENDS[@]}"; do
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
|
||||||
vllm serve "$MODEL" \
|
vllm serve "$MODEL" \
|
||||||
--enforce-eager \
|
--enforce-eager \
|
||||||
--tensor-parallel-size 2 \
|
--tensor-parallel-size 2 \
|
||||||
@@ -52,13 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--port $PORT &
|
--all2all-backend "$BACK" \
|
||||||
|
--port "$PORT" &
|
||||||
SERVER_PID=$!
|
SERVER_PID=$!
|
||||||
wait_for_server $PORT
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
python3 - <<PY
|
python3 - <<PY
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||||
|
|||||||
@@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euxo pipefail
|
||||||
|
|
||||||
|
# Nightly e2e test for prefetch offloading with a MoE model.
|
||||||
|
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
|
||||||
|
# and validates GSM8K accuracy matches baseline (no offloading).
|
||||||
|
#
|
||||||
|
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||||
|
THRESHOLD=${1:-0.25}
|
||||||
|
NUM_Q=${2:-1319}
|
||||||
|
PORT=${3:-8030}
|
||||||
|
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||||
|
mkdir -p "${OUT_DIR}"
|
||||||
|
|
||||||
|
wait_for_server() {
|
||||||
|
local port=$1
|
||||||
|
timeout 600 bash -c '
|
||||||
|
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||||
|
sleep 1
|
||||||
|
done'
|
||||||
|
}
|
||||||
|
|
||||||
|
MODEL="deepseek-ai/DeepSeek-V2-Lite"
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||||
|
kill "${SERVER_PID}" 2>/dev/null || true
|
||||||
|
for _ in {1..20}; do
|
||||||
|
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||||
|
sleep 0.5
|
||||||
|
done
|
||||||
|
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
vllm serve "$MODEL" \
|
||||||
|
--max-model-len 2048 \
|
||||||
|
--offload-group-size 8 \
|
||||||
|
--offload-num-in-group 2 \
|
||||||
|
--offload-prefetch-step 1 \
|
||||||
|
--offload-params w13_weight w2_weight \
|
||||||
|
--port "$PORT" &
|
||||||
|
SERVER_PID=$!
|
||||||
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
|
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
|
||||||
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
|
python3 - <<PY
|
||||||
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
|
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
|
||||||
|
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
|
||||||
|
PY
|
||||||
|
|
||||||
|
cleanup
|
||||||
|
SERVER_PID=
|
||||||
@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
vllm serve "$MODEL" \
|
vllm serve "$MODEL" \
|
||||||
--enforce-eager \
|
--enforce-eager \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--all2all-backend $BACK \
|
--all2all-backend "$BACK" \
|
||||||
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
||||||
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
|
||||||
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--port $PORT &
|
--port "$PORT" &
|
||||||
SERVER_PID=$!
|
SERVER_PID=$!
|
||||||
wait_for_server $PORT
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
python3 - <<PY
|
python3 - <<PY
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||||
|
|||||||
@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
--tensor-parallel-size 4 \
|
--tensor-parallel-size 4 \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--all2all-backend $BACK \
|
--all2all-backend "$BACK" \
|
||||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
"${PLATFORM_ARGS[@]}" \
|
"${PLATFORM_ARGS[@]}" \
|
||||||
--port $PORT &
|
--port "$PORT" &
|
||||||
SERVER_PID=$!
|
SERVER_PID=$!
|
||||||
wait_for_server $PORT
|
wait_for_server "$PORT"
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
|
||||||
python3 - <<PY
|
python3 - <<PY
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||||
|
|||||||
@@ -9,10 +9,11 @@ ENV_FILE=$1
|
|||||||
|
|
||||||
# For testing on local vm, use `set -a` to export all variables
|
# For testing on local vm, use `set -a` to export all variables
|
||||||
source /etc/environment
|
source /etc/environment
|
||||||
source $ENV_FILE
|
# shellcheck source=/dev/null
|
||||||
|
source "$ENV_FILE"
|
||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f $CONTAINER_NAME || true;
|
docker rm -f "$CONTAINER_NAME" || true;
|
||||||
}
|
}
|
||||||
|
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
@@ -41,13 +42,13 @@ echo
|
|||||||
echo "starting docker...$CONTAINER_NAME"
|
echo "starting docker...$CONTAINER_NAME"
|
||||||
echo
|
echo
|
||||||
docker run \
|
docker run \
|
||||||
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
|
||||||
--env-file $ENV_FILE \
|
--env-file "$ENV_FILE" \
|
||||||
-e HF_TOKEN="$HF_TOKEN" \
|
-e HF_TOKEN="$HF_TOKEN" \
|
||||||
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
|
||||||
-e MODEL=$MODEL \
|
-e MODEL="$MODEL" \
|
||||||
-e WORKSPACE=/workspace \
|
-e WORKSPACE=/workspace \
|
||||||
--name $CONTAINER_NAME \
|
--name "$CONTAINER_NAME" \
|
||||||
-d \
|
-d \
|
||||||
--privileged \
|
--privileged \
|
||||||
--network host \
|
--network host \
|
||||||
|
|||||||
@@ -42,21 +42,21 @@ echo "lanching vllm..."
|
|||||||
echo "logging to $VLLM_LOG"
|
echo "logging to $VLLM_LOG"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
vllm serve $MODEL \
|
vllm serve "$MODEL" \
|
||||||
--seed 42 \
|
--seed 42 \
|
||||||
--max-num-seqs $MAX_NUM_SEQS \
|
--max-num-seqs "$MAX_NUM_SEQS" \
|
||||||
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
|
||||||
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
|
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
|
||||||
--no-enable-prefix-caching \
|
--no-enable-prefix-caching \
|
||||||
--download_dir $DOWNLOAD_DIR \
|
--download_dir "$DOWNLOAD_DIR" \
|
||||||
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
|
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
|
||||||
|
|
||||||
|
|
||||||
echo "wait for 20 minutes.."
|
echo "wait for 20 minutes.."
|
||||||
echo
|
echo
|
||||||
# sleep 1200
|
# sleep 1200
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
for i in {1..120}; do
|
for _ in {1..120}; do
|
||||||
# TODO: detect other type of errors.
|
# TODO: detect other type of errors.
|
||||||
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
|
||||||
echo "Detected RuntimeError, exiting."
|
echo "Detected RuntimeError, exiting."
|
||||||
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
|
|||||||
echo
|
echo
|
||||||
vllm bench serve \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model "$MODEL" \
|
||||||
--dataset-name sonnet \
|
--dataset-name sonnet \
|
||||||
--dataset-path benchmarks/sonnet_4x.txt \
|
--dataset-path benchmarks/sonnet_4x.txt \
|
||||||
--sonnet-input-len $INPUT_LEN \
|
--sonnet-input-len "$INPUT_LEN" \
|
||||||
--sonnet-output-len $OUTPUT_LEN \
|
--sonnet-output-len "$OUTPUT_LEN" \
|
||||||
--ignore-eos > "$BM_LOG"
|
--ignore-eos > "$BM_LOG"
|
||||||
|
|
||||||
echo "completed..."
|
echo "completed..."
|
||||||
|
|||||||
@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR"
|
|||||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||||
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
|
alias_args=()
|
||||||
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
|
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
|
||||||
else
|
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
|
||||||
alias_arg=""
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
|
||||||
|
|
||||||
# copy indices to /<commit>/ unconditionally
|
# copy indices to /<commit>/ unconditionally
|
||||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||||
@@ -100,9 +99,9 @@ fi
|
|||||||
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
|
||||||
if [[ "$version" != *"dev"* ]]; then
|
if [[ "$version" != *"dev"* ]]; then
|
||||||
echo "Re-generating indices for /$pure_version/"
|
echo "Re-generating indices for /$pure_version/"
|
||||||
rm -rf "$INDICES_OUTPUT_DIR/*"
|
rm -rf "${INDICES_OUTPUT_DIR:?}/*"
|
||||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||||
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
|
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
|
||||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
|
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
|
||||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
|
|||||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||||
|
|
||||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||||
|
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
|
||||||
|
|
||||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||||
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
|
||||||
if [ -z "$GIT_VERSION" ]; then
|
if [[ -z "$GIT_VERSION" ]]; then
|
||||||
echo "[FATAL] Not on a git tag, cannot create release."
|
echo "[FATAL] Not on a git tag, cannot create release."
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||||
fi
|
fi
|
||||||
# sanity check for version mismatch
|
# sanity check for version mismatch
|
||||||
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
|
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
|
||||||
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
|
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
|
||||||
echo "[WARNING] Force release and ignore version mismatch"
|
echo "[WARNING] Force release and ignore version mismatch"
|
||||||
else
|
else
|
||||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||||
@@ -27,7 +29,7 @@ fi
|
|||||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||||
|
|
||||||
# check pypi token
|
# check pypi token
|
||||||
if [ -z "$PYPI_TOKEN" ]; then
|
if [[ -z "$PYPI_TOKEN" ]]; then
|
||||||
echo "[FATAL] PYPI_TOKEN is not set."
|
echo "[FATAL] PYPI_TOKEN is not set."
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
@@ -35,41 +37,8 @@ else
|
|||||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# check github token
|
|
||||||
if [ -z "$GITHUB_TOKEN" ]; then
|
|
||||||
echo "[FATAL] GITHUB_TOKEN is not set."
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
export GH_TOKEN="$GITHUB_TOKEN"
|
|
||||||
fi
|
|
||||||
|
|
||||||
set -x # avoid printing secrets above
|
set -x # avoid printing secrets above
|
||||||
|
|
||||||
# download gh CLI from github
|
|
||||||
# Get latest gh CLI version from GitHub API
|
|
||||||
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
|
|
||||||
if [ -z "$GH_VERSION" ]; then
|
|
||||||
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Downloading gh CLI version: $GH_VERSION"
|
|
||||||
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
|
|
||||||
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
|
|
||||||
GH_INSTALL_DIR="/tmp/gh-install"
|
|
||||||
mkdir -p "$GH_INSTALL_DIR"
|
|
||||||
pushd "$GH_INSTALL_DIR"
|
|
||||||
curl -L -o "$GH_TARBALL" "$GH_URL"
|
|
||||||
tar -xzf "$GH_TARBALL"
|
|
||||||
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
|
|
||||||
if [ -z "$GH_BIN" ]; then
|
|
||||||
echo "[FATAL] Failed to find gh CLI executable"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
|
|
||||||
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
|
|
||||||
command "$GH_BIN" release list --limit 5
|
|
||||||
popd
|
|
||||||
|
|
||||||
# install twine from pypi
|
# install twine from pypi
|
||||||
python3 -m venv /tmp/vllm-release-env
|
python3 -m venv /tmp/vllm-release-env
|
||||||
source /tmp/vllm-release-env/bin/activate
|
source /tmp/vllm-release-env/bin/activate
|
||||||
@@ -86,19 +55,16 @@ mkdir -p $DIST_DIR
|
|||||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||||
echo "Wheels copied to local directory"
|
echo "Wheels copied to local directory"
|
||||||
# generate source tarball
|
# generate source tarball
|
||||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
||||||
ls -la $DIST_DIR
|
ls -la $DIST_DIR
|
||||||
|
|
||||||
|
|
||||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||||
if [ -z "$PYPI_WHEEL_FILES" ]; then
|
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||||
echo "No default variant wheels found, quitting..."
|
echo "No default variant wheels found, quitting..."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
python3 -m twine check $PYPI_WHEEL_FILES
|
|
||||||
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
|
|
||||||
echo "Wheels uploaded to PyPI"
|
|
||||||
|
|
||||||
# create release on GitHub with the release version and all wheels
|
python3 -m twine check "$PYPI_WHEEL_FILES"
|
||||||
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
|
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
||||||
|
echo "Wheels uploaded to PyPI"
|
||||||
@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
|
|||||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||||
|
|
||||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||||
|
|
||||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||||
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Extract version from vLLM wheel and update version-specific index
|
# Extract version from vLLM wheel and update version-specific index
|
||||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
|
||||||
if [ -n "$VLLM_WHEEL" ]; then
|
if [ -n "$VLLM_WHEEL" ]; then
|
||||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||||
echo "Version in wheel: $VERSION"
|
echo "Version in wheel: $VERSION"
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: V1 attention (H100)
|
- label: V1 attention (H100)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
gpu: h100
|
device: h100
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/config/attention.py
|
- vllm/config/attention.py
|
||||||
- vllm/model_executor/layers/attention
|
- vllm/model_executor/layers/attention
|
||||||
@@ -15,7 +15,7 @@ steps:
|
|||||||
|
|
||||||
- label: V1 attention (B200)
|
- label: V1 attention (B200)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
gpu: b200
|
device: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/config/attention.py
|
- vllm/config/attention.py
|
||||||
- vllm/model_executor/layers/attention
|
- vllm/model_executor/layers/attention
|
||||||
|
|||||||
@@ -14,3 +14,8 @@ steps:
|
|||||||
- pytest -v -s basic_correctness/test_cumem.py
|
- pytest -v -s basic_correctness/test_cumem.py
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|||||||
@@ -17,3 +17,15 @@ steps:
|
|||||||
- tests/benchmarks/
|
- tests/benchmarks/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s benchmarks/
|
- pytest -v -s benchmarks/
|
||||||
|
|
||||||
|
- label: Attention Benchmarks Smoke Test (B200)
|
||||||
|
device: b200
|
||||||
|
num_gpus: 2
|
||||||
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
source_file_dependencies:
|
||||||
|
- benchmarks/attention_benchmarks/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
commands:
|
||||||
|
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
|
||||||
|
|||||||
@@ -2,56 +2,200 @@ group: Compile
|
|||||||
depends_on:
|
depends_on:
|
||||||
- image-build
|
- image-build
|
||||||
steps:
|
steps:
|
||||||
- label: Fusion and Compile Tests (B200)
|
- label: Sequence Parallel Correctness Tests (2 GPUs)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- vllm/model_executor/layers/
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/compilation/
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/v1/worker/
|
- vllm/v1/worker/
|
||||||
- vllm/v1/cudagraph_dispatcher.py
|
- vllm/v1/cudagraph_dispatcher.py
|
||||||
- vllm/compilation/
|
- tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
- tests/compile/test_fusion_attn.py
|
|
||||||
- tests/compile/test_silu_mul_quant_fusion.py
|
|
||||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
|
||||||
- tests/compile/fullgraph/test_full_graph.py
|
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
|
||||||
# Wrap with quotes to escape yaml
|
|
||||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
|
||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
|
||||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
|
||||||
|
|
||||||
- label: Fusion E2E (2 GPUs)(B200)
|
- label: Sequence Parallel Correctness Tests (2xH100)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||||
|
|
||||||
|
- label: AsyncTP Correctness Tests (2xH100)
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
|
- label: Distributed Compile Unit Tests (2xH100)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/compilation/
|
||||||
|
- vllm/model_executor/layers
|
||||||
|
- tests/compile/passes/distributed/
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -s -v tests/compile/passes/distributed
|
||||||
|
|
||||||
|
- label: Fusion and Compile Unit Tests (B200)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
- vllm/model_executor/layers/layernorm.py
|
||||||
- vllm/model_executor/layers/activation.py
|
- vllm/model_executor/layers/activation.py
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
- tests/compile/distributed/test_fusions_e2e.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
|
||||||
|
- tests/compile/passes/test_fusion_attn.py
|
||||||
|
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
|
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
|
commands:
|
||||||
|
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
|
||||||
|
- nvidia-smi
|
||||||
|
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
|
||||||
|
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
|
# this runner has 2 GPUs available even though num_devices=2 is not set
|
||||||
|
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
|
# TODO(luka) move to H100 once pass tests run on H100
|
||||||
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
|
- label: Fusion E2E Quick (H100)
|
||||||
|
timeout_in_minutes: 15
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 1
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all e2e fusion tests
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||||
|
|
||||||
|
- label: Fusion E2E Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 1
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E Config Sweep (B200)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
num_devices: 1
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||||
|
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 Quick (H100)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/attention/attention.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
|
||||||
|
|
||||||
|
- label: Fusion E2E TP2 (B200)
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/
|
||||||
|
- vllm/model_executor/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/compile/fusions_e2e/
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||||
|
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
|
# for ar-rms-quant-fp4, also sweep llama3
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
|
||||||
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ steps:
|
|||||||
- tests/cuda
|
- tests/cuda
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s cuda/test_cuda_context.py
|
- pytest -v -s cuda/test_cuda_context.py
|
||||||
|
- pytest -v -s cuda/test_platform_no_cuda_init.py
|
||||||
|
|
||||||
- label: Cudagraph
|
- label: Cudagraph
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Distributed Comm Ops
|
- label: Distributed Comm Ops
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed
|
- vllm/distributed
|
||||||
- tests/distributed
|
- tests/distributed
|
||||||
@@ -16,9 +16,9 @@ steps:
|
|||||||
- pytest -v -s distributed/test_shm_storage.py
|
- pytest -v -s distributed/test_shm_storage.py
|
||||||
|
|
||||||
- label: Distributed (2 GPUs)
|
- label: Distributed (2 GPUs)
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/compilation/
|
- vllm/compilation/
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
@@ -47,14 +47,13 @@ steps:
|
|||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs)
|
- label: Distributed Tests (4 GPUs)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
@@ -63,6 +62,7 @@ steps:
|
|||||||
- tests/compile/fullgraph/test_basic_correctness.py
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
|
- examples/offline_inference/new_weight_syncing/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/distributed
|
- tests/v1/distributed
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
@@ -97,14 +97,19 @@ steps:
|
|||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
|
# OLD rlhf examples
|
||||||
- cd ../examples/offline_inference
|
- cd ../examples/offline_inference
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||||
|
# NEW rlhf examples
|
||||||
|
- cd new_weight_syncing
|
||||||
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||||
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||||
|
|
||||||
- label: Distributed Tests (8 GPUs)(H100)
|
- label: Distributed Tests (8 GPUs)(H100)
|
||||||
timeout_in_minutes: 10
|
timeout_in_minutes: 10
|
||||||
gpu: h100
|
device: h100
|
||||||
num_gpus: 8
|
num_devices: 8
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- examples/offline_inference/torchrun_dp_example.py
|
- examples/offline_inference/torchrun_dp_example.py
|
||||||
@@ -120,9 +125,9 @@ steps:
|
|||||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs)(A100)
|
- label: Distributed Tests (4 GPUs)(A100)
|
||||||
gpu: a100
|
device: a100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
commands:
|
commands:
|
||||||
@@ -133,26 +138,23 @@ steps:
|
|||||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest -v -s -x lora/test_mixtral.py
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs)(H200)
|
- label: Distributed Tests (2 GPUs)(H100)
|
||||||
gpu: h200
|
timeout_in_minutes: 15
|
||||||
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
commands:
|
commands:
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
|
||||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
|
||||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
|
||||||
|
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs)(B200)
|
- label: Distributed Tests (2 GPUs)(B200)
|
||||||
gpu: b200
|
device: b200
|
||||||
optional: true
|
optional: true
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||||
@@ -161,8 +163,10 @@ steps:
|
|||||||
- label: 2 Node Test (4 GPUs)
|
- label: 2 Node Test (4 GPUs)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
num_nodes: 2
|
num_nodes: 2
|
||||||
|
no_plugin: true
|
||||||
|
optional: true # TODO: revert once infra issue solved
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/engine/
|
- vllm/engine/
|
||||||
@@ -171,12 +175,12 @@ steps:
|
|||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
commands:
|
commands:
|
||||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
||||||
|
|
||||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
- tests/v1/kv_connector/nixl_integration/
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
@@ -184,10 +188,32 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
- label: Pipeline + Context Parallelism (4 GPUs))
|
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/engine/
|
- vllm/engine/
|
||||||
@@ -196,4 +222,4 @@ steps:
|
|||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|||||||
@@ -4,39 +4,36 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: h100
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: h100
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: b200
|
device: b200
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
- label: Prime-RL Integration (2 GPUs)
|
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 60
|
||||||
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
soft_fail: true
|
num_devices: 1
|
||||||
num_gpus: 2
|
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||||
|
|
||||||
- label: V1 e2e + engine
|
- label: V1 e2e + engine (1 GPU)
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -23,4 +23,48 @@ steps:
|
|||||||
# TODO: accuracy does not match, whether setting
|
# TODO: accuracy does not match, whether setting
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- pytest -v -s v1/e2e
|
||||||
- pytest -v -s v1/engine
|
# Run this test standalone for now;
|
||||||
|
# need to untangle use (implicit) use of spawn/fork across the tests.
|
||||||
|
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||||
|
# Run the rest of v1/engine tests
|
||||||
|
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/e2e
|
||||||
|
- pytest -v -s v1/engine
|
||||||
|
|
||||||
|
- label: V1 e2e (2 GPUs)
|
||||||
|
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/v1/e2e
|
||||||
|
commands:
|
||||||
|
# Only run tests that need exactly 2 GPUs
|
||||||
|
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_2
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
|
- label: V1 e2e (4 GPUs)
|
||||||
|
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||||
|
optional: true
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/v1/e2e
|
||||||
|
commands:
|
||||||
|
# Only run tests that need 4 GPUs
|
||||||
|
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_4
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|||||||
@@ -24,6 +24,11 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 1)
|
- label: Entrypoints Integration (API Server 1)
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@@ -42,15 +47,13 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/tool_use
|
|
||||||
- tests/entrypoints/sleep
|
|
||||||
- tests/entrypoints/instrumentator
|
|
||||||
- tests/entrypoints/rpc
|
- tests/entrypoints/rpc
|
||||||
|
- tests/entrypoints/instrumentator
|
||||||
|
- tests/tool_use
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
|
||||||
- pytest -v -s entrypoints/instrumentator
|
- pytest -v -s entrypoints/instrumentator
|
||||||
- pytest -v -s entrypoints/sleep
|
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s tool_use
|
||||||
|
|
||||||
- label: Entrypoints Integration (Pooling)
|
- label: Entrypoints Integration (Pooling)
|
||||||
@@ -62,6 +65,11 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/pooling
|
- pytest -v -s entrypoints/pooling
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (Responses API)
|
- label: Entrypoints Integration (Responses API)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
|
|||||||
@@ -14,10 +14,25 @@ steps:
|
|||||||
- label: EPLB Execution
|
- label: EPLB Execution
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/eplb
|
- vllm/distributed/eplb
|
||||||
- tests/distributed/test_eplb_execute.py
|
- tests/distributed/test_eplb_execute.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_eplb_execute.py
|
- pytest -v -s distributed/test_eplb_execute.py
|
||||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||||
|
|
||||||
|
- label: Elastic EP Scaling Test
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/
|
||||||
|
- vllm/engine/
|
||||||
|
- vllm/executor/
|
||||||
|
- vllm/compilation/
|
||||||
|
- tests/distributed/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s distributed/test_elastic_ep.py
|
||||||
|
|||||||
@@ -15,8 +15,9 @@ steps:
|
|||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/attention/
|
- csrc/attention/
|
||||||
- vllm/attention
|
|
||||||
- vllm/v1/attention
|
- vllm/v1/attention
|
||||||
|
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||||
|
- vllm/model_executor/layers/attention
|
||||||
- tests/kernels/attention
|
- tests/kernels/attention
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
@@ -43,7 +44,8 @@ steps:
|
|||||||
- vllm/envs.py
|
- vllm/envs.py
|
||||||
- vllm/config
|
- vllm/config
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
|
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 2
|
||||||
|
|
||||||
- label: Kernels Mamba Test
|
- label: Kernels Mamba Test
|
||||||
@@ -57,8 +59,8 @@ steps:
|
|||||||
|
|
||||||
- label: Kernels DeepGEMM Test (H100)
|
- label: Kernels DeepGEMM Test (H100)
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
gpu: h100
|
device: h100
|
||||||
num_gpus: 1
|
num_devices: 1
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tools/install_deepgemm.sh
|
- tools/install_deepgemm.sh
|
||||||
- vllm/utils/deep_gemm.py
|
- vllm/utils/deep_gemm.py
|
||||||
@@ -69,7 +71,7 @@ steps:
|
|||||||
- tests/kernels/moe/test_batched_deepgemm.py
|
- tests/kernels/moe/test_batched_deepgemm.py
|
||||||
- tests/kernels/attention/test_deepgemm_attention.py
|
- tests/kernels/attention/test_deepgemm_attention.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
- pytest -v -s kernels/quantization/test_block_fp8.py
|
||||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||||
@@ -77,7 +79,7 @@ steps:
|
|||||||
- label: Kernels (B200)
|
- label: Kernels (B200)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
device: b200
|
||||||
# optional: true
|
# optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
@@ -85,7 +87,7 @@ steps:
|
|||||||
- csrc/quantization/cutlass_w8a8/moe/
|
- csrc/quantization/cutlass_w8a8/moe/
|
||||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||||
@@ -114,4 +116,54 @@ steps:
|
|||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
|
||||||
|
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||||
|
# e2e
|
||||||
|
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||||
|
|
||||||
|
- label: Kernels Helion Test
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
device: h100
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/utils/import_utils.py
|
||||||
|
- tests/kernels/helion/
|
||||||
|
commands:
|
||||||
|
- pip install helion
|
||||||
|
- pytest -v -s kernels/helion/
|
||||||
|
|
||||||
|
|
||||||
|
- label: Kernels FP8 MoE Test (1 H100)
|
||||||
|
timeout_in_minutes: 90
|
||||||
|
device: h100
|
||||||
|
num_devices: 1
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||||
|
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||||
|
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_moe.py
|
||||||
|
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||||
|
- pytest -v -s kernels/moe/test_block_int8.py
|
||||||
|
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||||
|
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||||
|
|
||||||
|
- label: Kernels FP8 MoE Test (2 H100s)
|
||||||
|
timeout_in_minutes: 90
|
||||||
|
device: h100
|
||||||
|
num_devices: 2
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||||
|
|
||||||
|
- label: Kernels Fp4 MoE Test (B200)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
device: b200
|
||||||
|
num_devices: 1
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||||
|
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||||
|
|||||||
@@ -11,22 +11,22 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||||
|
|
||||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
# - label: LM Eval Large Models (4 GPUs)(A100)
|
||||||
gpu: a100
|
# device: a100
|
||||||
optional: true
|
# optional: true
|
||||||
num_gpus: 4
|
# num_devices: 4
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
# source_file_dependencies:
|
||||||
- csrc/
|
# - csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
# - vllm/model_executor/layers/quantization
|
||||||
commands:
|
# commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||||
gpu: h100
|
device: h100
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
@@ -37,10 +37,65 @@ steps:
|
|||||||
|
|
||||||
- label: LM Eval Small Models (B200)
|
- label: LM Eval Small Models (B200)
|
||||||
timeout_in_minutes: 120
|
timeout_in_minutes: 120
|
||||||
gpu: b200
|
device: b200
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||||
|
|
||||||
|
- label: LM Eval Large Models (H200)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
device: h200
|
||||||
|
optional: true
|
||||||
|
num_devices: 8
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
|
||||||
|
|
||||||
|
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
|
||||||
|
device: h100
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
|
||||||
|
|
||||||
|
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
|
||||||
|
|
||||||
|
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
||||||
|
|
||||||
|
- label: GPQA Eval (GPT-OSS) (H100)
|
||||||
|
timeout_in_minutes: 120
|
||||||
|
device: h100
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
- tests/evals/gpt_oss/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
|
||||||
|
|
||||||
|
- label: GPQA Eval (GPT-OSS) (B200)
|
||||||
|
timeout_in_minutes: 120
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
- tests/evals/gpt_oss/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ steps:
|
|||||||
|
|
||||||
- label: LoRA TP (Distributed)
|
- label: LoRA TP (Distributed)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
num_gpus: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s -m 'not cpu_test' v1/core
|
- pytest -v -s -m 'not cpu_test' v1/core
|
||||||
- pytest -v -s v1/executor
|
- pytest -v -s v1/executor
|
||||||
@@ -16,7 +17,8 @@ steps:
|
|||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/logits_processors
|
- pytest -v -s v1/logits_processors
|
||||||
- pytest -v -s v1/worker
|
- pytest -v -s v1/worker
|
||||||
- pytest -v -s v1/spec_decode
|
# TODO: create another `optional` test group for slow tests
|
||||||
|
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
@@ -25,13 +27,19 @@ steps:
|
|||||||
# Integration test for streaming correctness (requires special branch).
|
# Integration test for streaming correctness (requires special branch).
|
||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: V1 Others (CPU)
|
- label: V1 Others (CPU)
|
||||||
depends_on: ~
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1
|
- tests/v1
|
||||||
no_gpu: true
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s -m 'cpu_test' v1/core
|
- pytest -v -s -m 'cpu_test' v1/core
|
||||||
@@ -71,7 +79,7 @@ steps:
|
|||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
# for pooling models
|
# for pooling models
|
||||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||||
# for features demo
|
# for features demo
|
||||||
- python3 offline_inference/prefix_caching.py
|
- python3 offline_inference/prefix_caching.py
|
||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 offline_inference/llm_engine_example.py
|
||||||
@@ -82,7 +90,7 @@ steps:
|
|||||||
|
|
||||||
- label: Metrics, Tracing (2 GPUs)
|
- label: Metrics, Tracing (2 GPUs)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1/tracing
|
- tests/v1/tracing
|
||||||
@@ -107,19 +115,24 @@ steps:
|
|||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
|
- tests/detokenizer
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
- tests/utils_
|
- tests/utils_
|
||||||
commands:
|
commands:
|
||||||
|
- pytest -v -s detokenizer
|
||||||
- pytest -v -s -m 'not cpu_test' multimodal
|
- pytest -v -s -m 'not cpu_test' multimodal
|
||||||
- pytest -v -s utils_
|
- pytest -v -s utils_
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||||
depends_on: ~
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/test_inputs.py
|
- tests/test_inputs.py
|
||||||
- tests/test_outputs.py
|
- tests/test_outputs.py
|
||||||
|
- tests/test_pooling_params.py
|
||||||
|
- tests/test_ray_env.py
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
- tests/renderers
|
- tests/renderers
|
||||||
- tests/standalone_tests/lazy_imports.py
|
- tests/standalone_tests/lazy_imports.py
|
||||||
@@ -127,11 +140,13 @@ steps:
|
|||||||
- tests/tool_parsers
|
- tests/tool_parsers
|
||||||
- tests/transformers_utils
|
- tests/transformers_utils
|
||||||
- tests/config
|
- tests/config
|
||||||
no_gpu: true
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_imports.py
|
- python3 standalone_tests/lazy_imports.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
- pytest -v -s test_outputs.py
|
||||||
|
- pytest -v -s test_pooling_params.py
|
||||||
|
- pytest -v -s test_ray_env.py
|
||||||
- pytest -v -s -m 'cpu_test' multimodal
|
- pytest -v -s -m 'cpu_test' multimodal
|
||||||
- pytest -v -s renderers
|
- pytest -v -s renderers
|
||||||
- pytest -v -s tokenizers_
|
- pytest -v -s tokenizers_
|
||||||
@@ -139,23 +154,9 @@ steps:
|
|||||||
- pytest -v -s transformers_utils
|
- pytest -v -s transformers_utils
|
||||||
- pytest -v -s config
|
- pytest -v -s config
|
||||||
|
|
||||||
- label: GPT-OSS Eval (B200)
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- tests/evals/gpt_oss
|
|
||||||
- vllm/model_executor/models/gpt_oss.py
|
|
||||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
commands:
|
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
|
||||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
|
||||||
|
|
||||||
- label: Batch Invariance (H100)
|
- label: Batch Invariance (H100)
|
||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
gpu: h100
|
device: h100
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/v1/attention
|
- vllm/v1/attention
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
@@ -164,4 +165,18 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pip install pytest-timeout pytest-forked
|
- pip install pytest-timeout pytest-forked
|
||||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||||
|
|
||||||
|
- label: Acceptance Length Test (Large Models) # optional
|
||||||
|
timeout_in_minutes: 25
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 1
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/spec_decode/
|
||||||
|
- vllm/model_executor/models/mlp_speculator.py
|
||||||
|
- tests/v1/spec_decode/test_acceptance_length.py
|
||||||
|
commands:
|
||||||
|
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||||
|
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: Basic Models Tests (Initialization)
|
- label: Basic Models Tests (Initialization)
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -16,7 +15,6 @@ steps:
|
|||||||
|
|
||||||
- label: Basic Models Tests (Extra Initialization) %N
|
- label: Basic Models Tests (Extra Initialization) %N
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
@@ -33,18 +31,27 @@ steps:
|
|||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
|
- tests/models/test_terratorch.py
|
||||||
- tests/models/test_transformers.py
|
- tests/models/test_transformers.py
|
||||||
- tests/models/test_registry.py
|
- tests/models/test_registry.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
|
|
||||||
- label: Basic Models Test (Other CPU) # 5min
|
- label: Basic Models Test (Other CPU) # 5min
|
||||||
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
timeout_in_minutes: 10
|
timeout_in_minutes: 10
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/test_utils.py
|
- tests/models/test_utils.py
|
||||||
- tests/models/test_vision.py
|
- tests/models/test_vision.py
|
||||||
no_gpu: true
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Distributed Model Tests (2 GPUs)
|
- label: Distributed Model Tests (2 GPUs)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ depends_on:
|
|||||||
steps:
|
steps:
|
||||||
- label: Language Models Tests (Standard)
|
- label: Language Models Tests (Standard)
|
||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -16,7 +15,6 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Tests (Extra Standard) %N
|
- label: Language Models Tests (Extra Standard) %N
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/models/
|
- vllm/model_executor/models/
|
||||||
@@ -32,7 +30,6 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Tests (Hybrid) %N
|
- label: Language Models Tests (Hybrid) %N
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -40,7 +37,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# Install fast path packages for testing against transformers
|
# Install fast path packages for testing against transformers
|
||||||
# Note: also needed to run plamo2 model in vLLM
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
# Shard hybrid language model tests
|
# Shard hybrid language model tests
|
||||||
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
|
||||||
@@ -48,7 +45,6 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Test (Extended Generation) # 80min
|
- label: Language Models Test (Extended Generation) # 80min
|
||||||
timeout_in_minutes: 110
|
timeout_in_minutes: 110
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -56,13 +52,21 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# Install fast path packages for testing against transformers
|
# Install fast path packages for testing against transformers
|
||||||
# Note: also needed to run plamo2 model in vLLM
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
commands:
|
||||||
|
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
|
||||||
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
|
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||||
|
|
||||||
- label: Language Models Test (PPL)
|
- label: Language Models Test (PPL)
|
||||||
timeout_in_minutes: 110
|
timeout_in_minutes: 110
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -72,17 +76,20 @@ steps:
|
|||||||
|
|
||||||
- label: Language Models Test (Extended Pooling) # 36min
|
- label: Language Models Test (Extended Pooling) # 36min
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/pooling
|
- tests/models/language/pooling
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Language Models Test (MTEB)
|
- label: Language Models Test (MTEB)
|
||||||
timeout_in_minutes: 110
|
timeout_in_minutes: 110
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
|
|||||||
@@ -14,11 +14,14 @@ steps:
|
|||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
- label: Multi-Modal Processor Test (CPU)
|
- label: Multi-Modal Processor Test (CPU)
|
||||||
|
depends_on:
|
||||||
|
- image-build-cpu
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
no_gpu: true
|
- tests/models/registry.py
|
||||||
|
device: cpu
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||||
@@ -28,6 +31,7 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
|
- tests/models/registry.py
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||||
@@ -68,12 +72,3 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||||
|
|
||||||
# This test is used only in PR development phase to test individual models and should never run on main
|
|
||||||
- label: Custom Models
|
|
||||||
optional: true
|
|
||||||
commands:
|
|
||||||
- echo 'Testing custom models...'
|
|
||||||
# PR authors can temporarily add commands below to test individual models
|
|
||||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
|
||||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Plugin Tests (2 GPUs)
|
- label: Plugin Tests (2 GPUs)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/plugins/
|
- vllm/plugins/
|
||||||
- tests/plugins/
|
- tests/plugins/
|
||||||
@@ -19,6 +19,10 @@ steps:
|
|||||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
|
# test bge_m3_sparse io_processor plugin
|
||||||
|
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||||
|
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
|
||||||
|
- pip uninstall bge_m3_sparse_plugin -y
|
||||||
# end io_processor plugins test
|
# end io_processor plugins test
|
||||||
# begin stat_logger plugins test
|
# begin stat_logger plugins test
|
||||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ depends_on:
|
|||||||
- image-build
|
- image-build
|
||||||
steps:
|
steps:
|
||||||
- label: PyTorch Compilation Unit Tests
|
- label: PyTorch Compilation Unit Tests
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 10
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@@ -17,8 +17,16 @@ steps:
|
|||||||
# (using -0 for proper path handling)
|
# (using -0 for proper path handling)
|
||||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
|
- label: PyTorch Compilation Passes Unit Tests
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/compile/passes
|
||||||
|
commands:
|
||||||
|
- pytest -s -v compile/passes --ignore compile/passes/distributed
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test
|
- label: PyTorch Fullgraph Smoke Test
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 35
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@@ -30,16 +38,13 @@ steps:
|
|||||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph
|
- label: PyTorch Fullgraph
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 30
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
# Limit to no custom ops to reduce running time
|
|
||||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
|
||||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
|
||||||
|
|
||||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||||
# if this test fails, it means the nightly torch version is not compatible with some
|
# if this test fails, it means the nightly torch version is not compatible with some
|
||||||
|
|||||||
@@ -16,14 +16,14 @@ steps:
|
|||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
# TODO(jerryzh168): resolve the above comment
|
# TODO(jerryzh168): resolve the above comment
|
||||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
|
||||||
- uv pip install --system conch-triton-kernels
|
- uv pip install --system conch-triton-kernels
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: Quantized MoE Test (B200)
|
- label: Quantized MoE Test (B200)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
device: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/quantization/test_blackwell_moe.py
|
- tests/quantization/test_blackwell_moe.py
|
||||||
- vllm/model_executor/models/deepseek_v2.py
|
- vllm/model_executor/models/deepseek_v2.py
|
||||||
|
|||||||
16
.buildkite/test_areas/ray_compat.yaml
Normal file
16
.buildkite/test_areas/ray_compat.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
group: Ray Compatibility
|
||||||
|
depends_on:
|
||||||
|
- image-build
|
||||||
|
steps:
|
||||||
|
- label: Ray Dependency Compatibility Check
|
||||||
|
# Informational only — does not block the pipeline.
|
||||||
|
# If this fails, it means the PR introduces a dependency that
|
||||||
|
# conflicts with Ray's dependency constraints.
|
||||||
|
# See https://github.com/vllm-project/vllm/issues/33599
|
||||||
|
soft_fail: true
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
source_file_dependencies:
|
||||||
|
- requirements/
|
||||||
|
- setup.py
|
||||||
|
commands:
|
||||||
|
- bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
|
||||||
@@ -12,3 +12,10 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s samplers
|
- pytest -v -s samplers
|
||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
commands:
|
||||||
|
- pytest -v -s samplers
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ steps:
|
|||||||
- label: Weight Loading Multiple GPU # 33min
|
- label: Weight Loading Multiple GPU # 33min
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_devices: 2
|
||||||
optional: true
|
optional: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
@@ -13,13 +13,13 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
# - label: Weight Loading Multiple GPU - Large Models # optional
|
||||||
working_dir: "/vllm-workspace/tests"
|
# working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
# num_devices: 2
|
||||||
gpu: a100
|
# device: a100
|
||||||
optional: true
|
# optional: true
|
||||||
source_file_dependencies:
|
# source_file_dependencies:
|
||||||
- vllm/
|
# - vllm/
|
||||||
- tests/weight_loading
|
# - tests/weight_loading
|
||||||
commands:
|
# commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
# - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|||||||
24
.github/.bc-linter.yml
vendored
24
.github/.bc-linter.yml
vendored
@@ -1,24 +0,0 @@
|
|||||||
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
|
|
||||||
version: 1
|
|
||||||
paths:
|
|
||||||
# We temporarily disable globally, and will only enable with `annotations.include`
|
|
||||||
# include:
|
|
||||||
# - "vllm/v1/attetion/*.py"
|
|
||||||
# - "vllm/v1/core/*.py"
|
|
||||||
exclude:
|
|
||||||
- "**/*.py"
|
|
||||||
|
|
||||||
scan:
|
|
||||||
functions: true # check free functions and methods
|
|
||||||
classes: true # check classes/dataclasses
|
|
||||||
public_only: true # ignore names starting with "_" at any level
|
|
||||||
|
|
||||||
annotations:
|
|
||||||
include: # decorators that force‑include a symbol
|
|
||||||
- name: "bc_linter_include" # matched by simple name or dotted suffix
|
|
||||||
propagate_to_members: false # for classes, include methods/inner classes
|
|
||||||
exclude: # decorators that force‑exclude a symbol
|
|
||||||
- name: "bc_linter_skip" # matched by simple name or dotted suffix
|
|
||||||
propagate_to_members: true # for classes, exclude methods/inner classes
|
|
||||||
|
|
||||||
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
|
|
||||||
63
.github/CODEOWNERS
vendored
63
.github/CODEOWNERS
vendored
@@ -2,43 +2,66 @@
|
|||||||
# for more info about CODEOWNERS file
|
# for more info about CODEOWNERS file
|
||||||
|
|
||||||
# This lists cover the "core" components of vLLM that require careful review
|
# This lists cover the "core" components of vLLM that require careful review
|
||||||
/vllm/attention @LucasWilkinson
|
/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
|
||||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
|
||||||
|
/vllm/lora @jeejeelee
|
||||||
|
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
|
||||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||||
/vllm/model_executor/layers/mamba @tdoublep
|
/vllm/model_executor/layers/mamba @tdoublep
|
||||||
/vllm/model_executor/model_loader @22quinn
|
/vllm/model_executor/model_loader @22quinn
|
||||||
/vllm/model_executor/layers/batch_invariant.py @yewentao256
|
/vllm/model_executor/layers/batch_invariant.py @yewentao256
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
|
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
|
||||||
/vllm/lora @jeejeelee
|
|
||||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
|
||||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
|
||||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
|
||||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
|
||||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
|
||||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||||
|
|
||||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||||
# so spam a lot of people
|
# so spam a lot of people
|
||||||
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||||
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
/vllm/config/cache.py @heheda12345
|
||||||
|
|
||||||
|
# Entrypoints
|
||||||
|
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
|
||||||
|
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
|
||||||
|
/vllm/entrypoints/mcp @heheda12345
|
||||||
|
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
|
||||||
|
/vllm/entrypoints/openai/realtime @njhill
|
||||||
|
/vllm/entrypoints/openai/speech_to_text @NickLucche
|
||||||
|
/vllm/entrypoints/pooling @noooop
|
||||||
|
/vllm/entrypoints/sagemaker @DarkLight1337
|
||||||
|
/vllm/entrypoints/serve @njhill
|
||||||
|
/vllm/entrypoints/*.py @njhill
|
||||||
|
/vllm/entrypoints/chat_utils.py @DarkLight1337
|
||||||
|
/vllm/entrypoints/llm.py @DarkLight1337
|
||||||
|
|
||||||
|
# Input/Output Processing
|
||||||
|
/vllm/sampling_params.py @njhill @NickLucche
|
||||||
|
/vllm/pooling_params.py @noooop @DarkLight1337
|
||||||
|
/vllm/tokenizers @DarkLight1337 @njhill
|
||||||
|
/vllm/renderers @DarkLight1337 @njhill
|
||||||
|
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||||
|
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1/attention @LucasWilkinson
|
/vllm/v1/attention @LucasWilkinson @MatthewBonanni
|
||||||
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
|
||||||
/vllm/v1/attention/backends/mla @pavanimajety
|
/vllm/v1/attention/backends/mla @pavanimajety
|
||||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||||
/vllm/v1/spec_decode @benchislett @luccafong
|
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
|
||||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||||
/vllm/v1/offloading @ApostaC
|
/vllm/v1/kv_offload @ApostaC @orozery
|
||||||
|
/vllm/v1/engine @njhill
|
||||||
|
/vllm/v1/executor @njhill
|
||||||
|
/vllm/v1/worker @njhill
|
||||||
|
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
|
||||||
|
|
||||||
# Model runner V2
|
# Model runner V2
|
||||||
/vllm/v1/worker/gpu @WoosukKwon
|
/vllm/v1/worker/gpu @WoosukKwon @njhill
|
||||||
|
/vllm/v1/worker/gpu/kv_connector.py @orozery
|
||||||
|
|
||||||
# Test ownership
|
# Test ownership
|
||||||
/.buildkite/lm-eval-harness @mgoin
|
/.buildkite/lm-eval-harness @mgoin
|
||||||
@@ -54,13 +77,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||||
/tests/lora @jeejeelee
|
/tests/lora @jeejeelee
|
||||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||||
/tests/v1/kv_connector @ApostaC
|
/tests/v1/kv_connector @ApostaC @orozery
|
||||||
/tests/v1/offloading @ApostaC
|
/tests/v1/kv_offload @ApostaC @orozery
|
||||||
/tests/v1/determinism @yewentao256
|
/tests/v1/determinism @yewentao256
|
||||||
|
|
||||||
# Transformers modeling backend
|
# Transformers modeling backend
|
||||||
@@ -113,8 +136,8 @@ mkdocs.yaml @hmellor
|
|||||||
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
|
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
|
||||||
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
|
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
|
||||||
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
|
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
|
||||||
|
/vllm/tokenizers/mistral.py @patrickvonplaten
|
||||||
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
|
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
|
||||||
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
|
|
||||||
|
|
||||||
# Kernels
|
# Kernels
|
||||||
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
|
||||||
@@ -150,9 +173,7 @@ mkdocs.yaml @hmellor
|
|||||||
/examples/pooling @noooop
|
/examples/pooling @noooop
|
||||||
/tests/models/*/pooling* @noooop
|
/tests/models/*/pooling* @noooop
|
||||||
/tests/entrypoints/pooling @noooop
|
/tests/entrypoints/pooling @noooop
|
||||||
/vllm/entrypoints/pooling @noooop
|
|
||||||
/vllm/config/pooler.py @noooop
|
/vllm/config/pooler.py @noooop
|
||||||
/vllm/pooling_params.py @noooop
|
|
||||||
/vllm/model_executor/layers/pooler @noooop
|
/vllm/model_executor/layers/pooler @noooop
|
||||||
|
|
||||||
# Security guide and policies
|
# Security guide and policies
|
||||||
|
|||||||
3
.github/mergify.yml
vendored
3
.github/mergify.yml
vendored
@@ -259,8 +259,7 @@ pull_request_rules:
|
|||||||
- files=benchmarks/run_structured_output_benchmark.sh
|
- files=benchmarks/run_structured_output_benchmark.sh
|
||||||
- files=docs/features/structured_outputs.md
|
- files=docs/features/structured_outputs.md
|
||||||
- files=examples/offline_inference/structured_outputs.py
|
- files=examples/offline_inference/structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
- files=examples/online_serving/structured_outputs/structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
|
||||||
- files~=^tests/v1/structured_output/
|
- files~=^tests/v1/structured_output/
|
||||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||||
- files~=^vllm/v1/structured_output/
|
- files~=^vllm/v1/structured_output/
|
||||||
|
|||||||
29
.github/workflows/bc-lint.yml
vendored
29
.github/workflows/bc-lint.yml
vendored
@@ -1,29 +0,0 @@
|
|||||||
name: BC Lint
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
types:
|
|
||||||
- opened
|
|
||||||
- synchronize
|
|
||||||
- reopened
|
|
||||||
- labeled
|
|
||||||
- unlabeled
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
bc_lint:
|
|
||||||
if: github.repository_owner == 'vllm-project'
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- name: Run BC Lint Action
|
|
||||||
uses: pytorch/test-infra/.github/actions/bc-lint@main
|
|
||||||
with:
|
|
||||||
repo: ${{ github.event.pull_request.head.repo.full_name }}
|
|
||||||
base_sha: ${{ github.event.pull_request.base.sha }}
|
|
||||||
head_sha: ${{ github.event.pull_request.head.sha }}
|
|
||||||
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
|
|
||||||
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
|
|
||||||
config_dir: .github
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
1
.github/workflows/cleanup_pr_body.yml
vendored
1
.github/workflows/cleanup_pr_body.yml
vendored
@@ -19,6 +19,7 @@ jobs:
|
|||||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||||
with:
|
with:
|
||||||
python-version: '3.12'
|
python-version: '3.12'
|
||||||
|
cache: 'pip'
|
||||||
|
|
||||||
- name: Install Python dependencies
|
- name: Install Python dependencies
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -3,6 +3,8 @@
|
|||||||
|
|
||||||
# vllm-flash-attn built from source
|
# vllm-flash-attn built from source
|
||||||
vllm/vllm_flash_attn/*
|
vllm/vllm_flash_attn/*
|
||||||
|
!vllm/vllm_flash_attn/__init__.py
|
||||||
|
!vllm/vllm_flash_attn/flash_attn_interface.py
|
||||||
|
|
||||||
# OpenAI triton kernels copied from source
|
# OpenAI triton kernels copied from source
|
||||||
vllm/third_party/triton_kernels/*
|
vllm/third_party/triton_kernels/*
|
||||||
@@ -238,3 +240,6 @@ ep_kernels_workspace/
|
|||||||
vllm/grpc/vllm_engine_pb2.py
|
vllm/grpc/vllm_engine_pb2.py
|
||||||
vllm/grpc/vllm_engine_pb2_grpc.py
|
vllm/grpc/vllm_engine_pb2_grpc.py
|
||||||
vllm/grpc/vllm_engine_pb2.pyi
|
vllm/grpc/vllm_engine_pb2.pyi
|
||||||
|
|
||||||
|
# Ignore generated cpu headers
|
||||||
|
csrc/cpu/cpu_attn_dispatch_generated.h
|
||||||
|
|||||||
@@ -121,24 +121,9 @@ repos:
|
|||||||
name: Update Dockerfile dependency graph
|
name: Update Dockerfile dependency graph
|
||||||
entry: tools/pre_commit/update-dockerfile-graph.sh
|
entry: tools/pre_commit/update-dockerfile-graph.sh
|
||||||
language: script
|
language: script
|
||||||
- id: enforce-import-regex-instead-of-re
|
- id: check-forbidden-imports
|
||||||
name: Enforce import regex as re
|
name: Check for forbidden imports
|
||||||
entry: python tools/pre_commit/enforce_regex_import.py
|
entry: python tools/pre_commit/check_forbidden_imports.py
|
||||||
language: python
|
|
||||||
types: [python]
|
|
||||||
pass_filenames: false
|
|
||||||
additional_dependencies: [regex]
|
|
||||||
# forbid directly import triton
|
|
||||||
- id: forbid-direct-triton-import
|
|
||||||
name: "Forbid direct 'import triton'"
|
|
||||||
entry: python tools/pre_commit/check_triton_import.py
|
|
||||||
language: python
|
|
||||||
types: [python]
|
|
||||||
pass_filenames: false
|
|
||||||
additional_dependencies: [regex]
|
|
||||||
- id: check-pickle-imports
|
|
||||||
name: Prevent new pickle/cloudpickle imports
|
|
||||||
entry: python tools/pre_commit/check_pickle_imports.py
|
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
@@ -154,6 +139,15 @@ repos:
|
|||||||
files: ^docker/(Dockerfile|versions\.json)$
|
files: ^docker/(Dockerfile|versions\.json)$
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
additional_dependencies: [dockerfile-parse]
|
additional_dependencies: [dockerfile-parse]
|
||||||
|
- id: attention-backend-docs
|
||||||
|
name: Check attention backend documentation is up to date
|
||||||
|
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
|
||||||
|
language: python
|
||||||
|
- id: check-boolean-context-manager
|
||||||
|
name: Check for boolean ops in with-statements
|
||||||
|
entry: python tools/pre_commit/check_boolean_context_manager.py
|
||||||
|
language: python
|
||||||
|
types: [python]
|
||||||
# Keep `suggestion` last
|
# Keep `suggestion` last
|
||||||
- id: suggestion
|
- id: suggestion
|
||||||
name: Suggestion
|
name: Suggestion
|
||||||
|
|||||||
@@ -9,13 +9,14 @@ build:
|
|||||||
python: "3.12"
|
python: "3.12"
|
||||||
jobs:
|
jobs:
|
||||||
post_checkout:
|
post_checkout:
|
||||||
- git fetch --unshallow || true
|
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||||
|
pre_create_environment:
|
||||||
|
- pip install uv
|
||||||
|
create_environment:
|
||||||
|
- uv venv $READTHEDOCS_VIRTUALENV_PATH
|
||||||
|
install:
|
||||||
|
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
|
||||||
|
|
||||||
mkdocs:
|
mkdocs:
|
||||||
configuration: mkdocs.yaml
|
configuration: mkdocs.yaml
|
||||||
fail_on_warning: true
|
fail_on_warning: true
|
||||||
|
|
||||||
# Optionally declare the Python requirements required to build your docs
|
|
||||||
python:
|
|
||||||
install:
|
|
||||||
- requirements: requirements/docs.txt
|
|
||||||
|
|||||||
@@ -56,8 +56,8 @@ endif()
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from docker/Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@@ -293,6 +293,7 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/fused_qknorm_rope_kernel.cu"
|
"csrc/fused_qknorm_rope_kernel.cu"
|
||||||
"csrc/layernorm_quant_kernels.cu"
|
"csrc/layernorm_quant_kernels.cu"
|
||||||
"csrc/sampler.cu"
|
"csrc/sampler.cu"
|
||||||
|
"csrc/topk.cu"
|
||||||
"csrc/cuda_view.cu"
|
"csrc/cuda_view.cu"
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/w8a8/int8/scaled_quant.cu"
|
"csrc/quantization/w8a8/int8/scaled_quant.cu"
|
||||||
@@ -433,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_SM75_ARCHS)
|
if (MARLIN_SM75_ARCHS)
|
||||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||||
@@ -445,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_FP8_ARCHS)
|
if (MARLIN_FP8_ARCHS)
|
||||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||||
@@ -458,7 +459,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(MARLIN_SRCS
|
set(MARLIN_SRCS
|
||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
|
||||||
"csrc/quantization/marlin/marlin.cu"
|
"csrc/quantization/marlin/marlin.cu"
|
||||||
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
||||||
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
||||||
@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# CUTLASS MoE kernels
|
# CUTLASS MoE kernels
|
||||||
|
|
||||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
||||||
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
# on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
|
||||||
# if it's possible to compile MoE kernels that use its output.
|
# if it's possible to compile MoE kernels that use its output.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||||
@@ -771,6 +771,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
|
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
else()
|
||||||
|
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||||
|
endif()
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
|
||||||
|
set(SRCS
|
||||||
|
"csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
|
||||||
|
"csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
|
||||||
|
message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
|
||||||
|
AND ES_MXFP8_GROUPED_MM_ARCHS)
|
||||||
|
message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.8.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
|
||||||
|
"in CUDA target architectures.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
|
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
else()
|
||||||
|
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||||
|
endif()
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
|
||||||
|
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
|
||||||
|
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
|
||||||
|
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
|
||||||
|
"in CUDA target architectures.")
|
||||||
|
endif()
|
||||||
|
|
||||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||||
@@ -953,7 +998,8 @@ set(VLLM_MOE_EXT_SRC
|
|||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
list(APPEND VLLM_MOE_EXT_SRC
|
list(APPEND VLLM_MOE_EXT_SRC
|
||||||
"csrc/moe/moe_wna16.cu"
|
"csrc/moe/moe_wna16.cu"
|
||||||
"csrc/moe/grouped_topk_kernels.cu")
|
"csrc/moe/grouped_topk_kernels.cu"
|
||||||
|
"csrc/moe/router_gemm.cu")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
@@ -1043,7 +1089,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MARLIN_MOE_SM75_ARCHS)
|
if (MARLIN_MOE_SM75_ARCHS)
|
||||||
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MARLIN_MOE_SM75_SRC}"
|
SRCS "${MARLIN_MOE_SM75_SRC}"
|
||||||
@@ -1082,6 +1128,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
" in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# DeepSeek V3 router GEMM kernel - requires SM90+
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||||
|
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
else()
|
||||||
|
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||||
|
endif()
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
|
||||||
|
set(DSV3_ROUTER_GEMM_SRC
|
||||||
|
"csrc/moe/dsv3_router_gemm_entry.cu"
|
||||||
|
"csrc/moe/dsv3_router_gemm_float_out.cu"
|
||||||
|
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${DSV3_ROUTER_GEMM_SRC}"
|
||||||
|
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
|
||||||
|
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
|
||||||
|
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
|
||||||
|
" (requires SM90+ and CUDA >= 12.0)")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
message(STATUS "Enabling moe extension.")
|
message(STATUS "Enabling moe extension.")
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
|
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
|
||||||
|
|
||||||
For full CLI reference see:
|
For full CLI reference see:
|
||||||
|
|
||||||
|
|||||||
266
benchmarks/attention_benchmarks/README.md
Normal file
266
benchmarks/attention_benchmarks/README.md
Normal file
@@ -0,0 +1,266 @@
|
|||||||
|
# vLLM Attention Benchmarking Suite
|
||||||
|
|
||||||
|
Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd benchmarks/attention_benchmarks
|
||||||
|
|
||||||
|
# Run a pre-configured benchmark
|
||||||
|
python benchmark.py --config configs/mla_decode.yaml
|
||||||
|
python benchmark.py --config configs/mla_mixed_batch.yaml
|
||||||
|
python benchmark.py --config configs/speculative_decode.yaml
|
||||||
|
python benchmark.py --config configs/standard_attention.yaml
|
||||||
|
python benchmark.py --config configs/reorder_threshold.yaml
|
||||||
|
|
||||||
|
# Or run custom benchmarks
|
||||||
|
python benchmark.py \
|
||||||
|
--backends flash flashinfer \
|
||||||
|
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
|
||||||
|
--output-csv results.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
## Simplified Batch Specification Grammar
|
||||||
|
|
||||||
|
Express workloads concisely using query length and sequence length:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
|
||||||
|
"q1s1k" # Decode: 1 token with 1K sequence
|
||||||
|
"8q1s1k" # 8 decode requests
|
||||||
|
"q4s1k" # 4-token extend (e.g., spec decode)
|
||||||
|
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
|
||||||
|
"16q4s1k" # 16 spec decode (4 tokens each)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Grammar Rule
|
||||||
|
|
||||||
|
```text
|
||||||
|
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
|
||||||
|
- count: Number of identical requests (optional, default=1)
|
||||||
|
- q_len: Query length (number of new tokens)
|
||||||
|
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||||
|
- 'k': Multiplies value by 1024
|
||||||
|
|
||||||
|
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
|
||||||
|
|
||||||
|
## Pre-configured Benchmarks
|
||||||
|
|
||||||
|
The suite includes several pre-configured YAML benchmark configurations:
|
||||||
|
|
||||||
|
### MLA Decode Benchmark
|
||||||
|
|
||||||
|
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/mla_decode.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### MLA Mixed Batch Benchmark
|
||||||
|
|
||||||
|
Tests chunked prefill performance with mixed prefill + decode batches.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/mla_mixed_batch.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Speculative Decoding Benchmark
|
||||||
|
|
||||||
|
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/speculative_decode.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Standard Attention Benchmark
|
||||||
|
|
||||||
|
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/standard_attention.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Reorder Threshold Study
|
||||||
|
|
||||||
|
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
|
||||||
|
|
||||||
|
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py --config configs/reorder_threshold.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Universal Benchmark
|
||||||
|
|
||||||
|
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
|
||||||
|
|
||||||
|
### Standard Attention (Flash/Triton/FlashInfer)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py \
|
||||||
|
--backends flash triton flashinfer \
|
||||||
|
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
|
||||||
|
--num-layers 10 \
|
||||||
|
--repeats 5 \
|
||||||
|
--output-csv results.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### MLA Backends
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Compare all MLA backends
|
||||||
|
python benchmark.py \
|
||||||
|
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
|
||||||
|
--batch-specs "64q1s1k" "64q1s4k" \
|
||||||
|
--output-csv mla_results.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parameter Sweeps
|
||||||
|
|
||||||
|
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
|
||||||
|
|
||||||
|
#### CUTLASS MLA num-splits Optimization
|
||||||
|
|
||||||
|
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py \
|
||||||
|
--backend cutlass_mla \
|
||||||
|
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
|
||||||
|
--sweep-param num_kv_splits \
|
||||||
|
--sweep-values 1 2 4 8 16 \
|
||||||
|
--output-json optimal_splits.json
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Reorder Batch Threshold Optimization
|
||||||
|
|
||||||
|
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python benchmark.py \
|
||||||
|
--backend flashmla \
|
||||||
|
--batch-specs "q4s1k" "q8s2k" \
|
||||||
|
--sweep-param reorder_batch_threshold \
|
||||||
|
--sweep-values 1 4 16 64 256 512 \
|
||||||
|
--output-csv threshold_sweep.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
### All Command-Line Options
|
||||||
|
|
||||||
|
```text
|
||||||
|
--config CONFIG # Path to YAML config file (overrides other args)
|
||||||
|
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
|
||||||
|
# flashinfer_mla, flashattn_mla, flashmla
|
||||||
|
--backend BACKEND # Single backend (alternative to --backends)
|
||||||
|
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
|
||||||
|
|
||||||
|
# Model configuration
|
||||||
|
--num-layers N # Number of layers
|
||||||
|
--head-dim N # Head dimension
|
||||||
|
--num-q-heads N # Query heads
|
||||||
|
--num-kv-heads N # KV heads
|
||||||
|
--block-size N # Block size
|
||||||
|
|
||||||
|
# Benchmark settings
|
||||||
|
--device DEVICE # Device (default: cuda:0)
|
||||||
|
--repeats N # Repetitions
|
||||||
|
--warmup-iters N # Warmup iterations
|
||||||
|
--profile-memory # Profile memory usage
|
||||||
|
|
||||||
|
# Parameter sweeps
|
||||||
|
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
|
||||||
|
# reorder_batch_threshold)
|
||||||
|
--sweep-values N [N ...] # Values to sweep for the parameter
|
||||||
|
|
||||||
|
# Output
|
||||||
|
--output-csv FILE # Save to CSV
|
||||||
|
--output-json FILE # Save to JSON
|
||||||
|
```
|
||||||
|
|
||||||
|
## Hardware Requirements
|
||||||
|
|
||||||
|
| Backend | Hardware |
|
||||||
|
|---------|----------|
|
||||||
|
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||||
|
| CUTLASS MLA | Blackwell (SM100+) |
|
||||||
|
| FlashAttn MLA | Hopper (SM90+) |
|
||||||
|
| FlashMLA | Hopper (SM90+) |
|
||||||
|
| FlashInfer-MLA | Any CUDA GPU |
|
||||||
|
|
||||||
|
## Using MLA Runner Directly
|
||||||
|
|
||||||
|
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mla_runner import run_mla_benchmark
|
||||||
|
from common import BenchmarkConfig
|
||||||
|
|
||||||
|
config = BenchmarkConfig(
|
||||||
|
backend="cutlass_mla",
|
||||||
|
batch_spec="64q1s4k",
|
||||||
|
num_layers=10,
|
||||||
|
head_dim=576,
|
||||||
|
num_q_heads=128,
|
||||||
|
num_kv_heads=1,
|
||||||
|
block_size=128,
|
||||||
|
device="cuda:0",
|
||||||
|
repeats=5,
|
||||||
|
warmup_iters=3,
|
||||||
|
)
|
||||||
|
|
||||||
|
# CUTLASS MLA with specific num_kv_splits
|
||||||
|
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
|
||||||
|
print(f"Time: {result.mean_time:.6f}s")
|
||||||
|
|
||||||
|
# FlashInfer-MLA
|
||||||
|
result = run_mla_benchmark("flashinfer_mla", config)
|
||||||
|
|
||||||
|
# FlashAttn MLA (Hopper SM90+)
|
||||||
|
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
|
||||||
|
|
||||||
|
# FlashMLA (Hopper SM90+)
|
||||||
|
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Python API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
|
||||||
|
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
|
||||||
|
|
||||||
|
# Parse batch specs
|
||||||
|
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
|
||||||
|
print(format_batch_spec(requests))
|
||||||
|
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
|
||||||
|
|
||||||
|
# Get batch statistics
|
||||||
|
stats = get_batch_stats(requests)
|
||||||
|
print(f"Total tokens: {stats['total_tokens']}")
|
||||||
|
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
|
||||||
|
|
||||||
|
# Format results
|
||||||
|
formatter = ResultsFormatter()
|
||||||
|
formatter.save_csv(results, "output.csv")
|
||||||
|
formatter.save_json(results, "output.json")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tips
|
||||||
|
|
||||||
|
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
|
||||||
|
|
||||||
|
**2. Multiple repeats** - Use `--repeats 20` for low variance
|
||||||
|
|
||||||
|
**3. Save results** - Always use `--output-csv` or `--output-json`
|
||||||
|
|
||||||
|
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
|
||||||
|
|
||||||
|
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
|
||||||
|
|
||||||
|
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values
|
||||||
42
benchmarks/attention_benchmarks/__init__.py
Normal file
42
benchmarks/attention_benchmarks/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""vLLM Attention Benchmarking Suite."""
|
||||||
|
|
||||||
|
from .batch_spec import (
|
||||||
|
BatchRequest,
|
||||||
|
format_batch_spec,
|
||||||
|
get_batch_stats,
|
||||||
|
parse_batch_spec,
|
||||||
|
reorder_for_flashinfer,
|
||||||
|
split_by_type,
|
||||||
|
)
|
||||||
|
from .common import (
|
||||||
|
BenchmarkConfig,
|
||||||
|
BenchmarkResult,
|
||||||
|
MockLayer,
|
||||||
|
ResultsFormatter,
|
||||||
|
get_attention_scale,
|
||||||
|
is_mla_backend,
|
||||||
|
setup_mla_dims,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Batch specification
|
||||||
|
"BatchRequest",
|
||||||
|
"parse_batch_spec",
|
||||||
|
"format_batch_spec",
|
||||||
|
"reorder_for_flashinfer",
|
||||||
|
"split_by_type",
|
||||||
|
"get_batch_stats",
|
||||||
|
# Benchmarking infrastructure
|
||||||
|
"BenchmarkConfig",
|
||||||
|
"BenchmarkResult",
|
||||||
|
"ResultsFormatter",
|
||||||
|
# Mock objects
|
||||||
|
"MockLayer",
|
||||||
|
# Utilities
|
||||||
|
"setup_mla_dims",
|
||||||
|
"get_attention_scale",
|
||||||
|
"is_mla_backend",
|
||||||
|
]
|
||||||
268
benchmarks/attention_benchmarks/batch_spec.py
Normal file
268
benchmarks/attention_benchmarks/batch_spec.py
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Simplified batch specification grammar for attention benchmarks.
|
||||||
|
|
||||||
|
Grammar (underscore-separated segments):
|
||||||
|
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
|
||||||
|
- count: Number of identical requests (optional, default=1)
|
||||||
|
- q_len: Query length (number of new tokens)
|
||||||
|
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||||
|
- 'k' suffix: Multiplies value by 1024
|
||||||
|
|
||||||
|
Common patterns:
|
||||||
|
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
|
||||||
|
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
|
||||||
|
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
|
||||||
|
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
|
||||||
|
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
|
||||||
|
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
|
||||||
|
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
|
||||||
|
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BatchRequest:
|
||||||
|
"""Represents a single request in a batch."""
|
||||||
|
|
||||||
|
q_len: int # Query length (number of new tokens)
|
||||||
|
kv_len: int # Total KV cache length
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_decode(self) -> bool:
|
||||||
|
"""True if this is a decode request (q_len == 1)."""
|
||||||
|
return self.q_len == 1
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_prefill(self) -> bool:
|
||||||
|
"""True if this is a pure prefill (q_len == kv_len)."""
|
||||||
|
return self.q_len == self.kv_len
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_extend(self) -> bool:
|
||||||
|
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
|
||||||
|
return self.q_len > 1 and self.kv_len > self.q_len
|
||||||
|
|
||||||
|
@property
|
||||||
|
def context_len(self) -> int:
|
||||||
|
"""Context length (KV cache - query)."""
|
||||||
|
return self.kv_len - self.q_len
|
||||||
|
|
||||||
|
def as_tuple(self) -> tuple[int, int]:
|
||||||
|
"""Return as (q_len, kv_len) tuple for compatibility."""
|
||||||
|
return (self.q_len, self.kv_len)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_size(size_str: str, k_suffix: str) -> int:
|
||||||
|
"""Parse size string with optional 'k' suffix."""
|
||||||
|
size = int(size_str)
|
||||||
|
return size * 1024 if k_suffix == "k" else size
|
||||||
|
|
||||||
|
|
||||||
|
def parse_batch_spec(spec: str) -> list[BatchRequest]:
|
||||||
|
"""
|
||||||
|
Parse batch specification string into list of BatchRequest objects.
|
||||||
|
|
||||||
|
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
|
||||||
|
Args:
|
||||||
|
spec: Batch specification string (see module docstring for grammar)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of BatchRequest objects
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If spec format is invalid
|
||||||
|
"""
|
||||||
|
requests = []
|
||||||
|
|
||||||
|
for seg in spec.split("_"):
|
||||||
|
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||||
|
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
|
||||||
|
if m:
|
||||||
|
cnt = int(m.group(1)) if m.group(1) else 1
|
||||||
|
q_len = _parse_size(m.group(2), m.group(3))
|
||||||
|
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
|
||||||
|
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
|
||||||
|
continue
|
||||||
|
|
||||||
|
raise ValueError(f"Invalid batch spec segment: '{seg}'")
|
||||||
|
|
||||||
|
return requests
|
||||||
|
|
||||||
|
|
||||||
|
def format_batch_spec(requests: list[BatchRequest]) -> str:
|
||||||
|
"""
|
||||||
|
Format list of BatchRequest into human-readable string.
|
||||||
|
|
||||||
|
Groups requests by type and provides counts and sizes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: List of BatchRequest objects
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted string describing the batch
|
||||||
|
"""
|
||||||
|
kinds = {
|
||||||
|
"prefill": [],
|
||||||
|
"extend": [],
|
||||||
|
"decode": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
for req in requests:
|
||||||
|
tup = (req.q_len, req.kv_len)
|
||||||
|
if req.is_prefill:
|
||||||
|
kinds["prefill"].append(tup)
|
||||||
|
elif req.is_extend:
|
||||||
|
kinds["extend"].append(tup)
|
||||||
|
elif req.is_decode:
|
||||||
|
kinds["decode"].append(tup)
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
for kind in ["prefill", "extend", "decode"]:
|
||||||
|
lst = kinds[kind]
|
||||||
|
if not lst:
|
||||||
|
continue
|
||||||
|
|
||||||
|
cnt_total = len(lst)
|
||||||
|
ctr = Counter(lst)
|
||||||
|
inner = []
|
||||||
|
|
||||||
|
for (q, kv), cnt in ctr.items():
|
||||||
|
if kind == "prefill":
|
||||||
|
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||||
|
inner.append(f"{cnt}x{size}")
|
||||||
|
elif kind == "decode":
|
||||||
|
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||||
|
inner.append(f"{cnt}x{size}")
|
||||||
|
else: # extend
|
||||||
|
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||||
|
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||||
|
inner.append(f"{cnt}xq{qstr}kv{kstr}")
|
||||||
|
|
||||||
|
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
|
||||||
|
|
||||||
|
return ", ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
|
||||||
|
"""
|
||||||
|
Reorder requests for FlashInfer: decode first, then prefill.
|
||||||
|
|
||||||
|
FlashInfer expects decode requests before prefill requests for
|
||||||
|
optimal performance.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: Original list of BatchRequest
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Reordered list with decode requests first
|
||||||
|
"""
|
||||||
|
decodes = [r for r in requests if r.is_decode]
|
||||||
|
non_decodes = [r for r in requests if not r.is_decode]
|
||||||
|
return decodes + non_decodes
|
||||||
|
|
||||||
|
|
||||||
|
def split_by_type(
|
||||||
|
requests: list[BatchRequest],
|
||||||
|
) -> dict[str, list[BatchRequest]]:
|
||||||
|
"""
|
||||||
|
Split requests by type for analysis.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: List of BatchRequest
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys: 'decode', 'prefill', 'extend'
|
||||||
|
"""
|
||||||
|
result = {
|
||||||
|
"decode": [],
|
||||||
|
"prefill": [],
|
||||||
|
"extend": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
for req in requests:
|
||||||
|
if req.is_decode:
|
||||||
|
result["decode"].append(req)
|
||||||
|
elif req.is_prefill:
|
||||||
|
result["prefill"].append(req)
|
||||||
|
elif req.is_extend:
|
||||||
|
result["extend"].append(req)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_batch_stats(requests: list[BatchRequest]) -> dict:
|
||||||
|
"""
|
||||||
|
Compute statistics about a batch.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests: List of BatchRequest
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with batch statistics
|
||||||
|
"""
|
||||||
|
by_type = split_by_type(requests)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total_requests": len(requests),
|
||||||
|
"num_decode": len(by_type["decode"]),
|
||||||
|
"num_prefill": len(by_type["prefill"]),
|
||||||
|
"num_extend": len(by_type["extend"]),
|
||||||
|
"total_tokens": sum(r.q_len for r in requests),
|
||||||
|
"total_kv_cache": sum(r.kv_len for r in requests),
|
||||||
|
"max_q_len": max((r.q_len for r in requests), default=0),
|
||||||
|
"max_kv_len": max((r.kv_len for r in requests), default=0),
|
||||||
|
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
|
||||||
|
"avg_kv_len": (
|
||||||
|
sum(r.kv_len for r in requests) / len(requests) if requests else 0
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
|
||||||
|
"""
|
||||||
|
Classify a batch spec into a type string.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
|
||||||
|
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
|
||||||
|
"""
|
||||||
|
requests = parse_batch_spec(batch_spec)
|
||||||
|
|
||||||
|
# Classify each request
|
||||||
|
types_present = set()
|
||||||
|
for req in requests:
|
||||||
|
if req.is_decode:
|
||||||
|
types_present.add("decode")
|
||||||
|
elif req.is_prefill:
|
||||||
|
types_present.add("prefill")
|
||||||
|
elif req.is_extend:
|
||||||
|
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
|
||||||
|
if req.q_len <= spec_decode_threshold:
|
||||||
|
types_present.add("spec-decode")
|
||||||
|
else:
|
||||||
|
types_present.add("extend")
|
||||||
|
|
||||||
|
if len(types_present) == 1:
|
||||||
|
return types_present.pop()
|
||||||
|
elif len(types_present) > 1:
|
||||||
|
# Sort for consistent output
|
||||||
|
sorted_types = sorted(types_present)
|
||||||
|
return f"mixed ({'+'.join(sorted_types)})"
|
||||||
|
else:
|
||||||
|
return "unknown"
|
||||||
895
benchmarks/attention_benchmarks/benchmark.py
Normal file
895
benchmarks/attention_benchmarks/benchmark.py
Normal file
@@ -0,0 +1,895 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Universal vLLM Attention Benchmark
|
||||||
|
|
||||||
|
Benchmark any attention backend with the extended grammar.
|
||||||
|
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
# Standard attention
|
||||||
|
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
|
||||||
|
|
||||||
|
# MLA backends
|
||||||
|
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
|
||||||
|
|
||||||
|
# Parameter sweep (CLI)
|
||||||
|
python benchmark.py --backend cutlass_mla \
|
||||||
|
--batch-specs "64q1s1k" \
|
||||||
|
--sweep-param num_kv_splits \
|
||||||
|
--sweep-values 1 4 8 16
|
||||||
|
|
||||||
|
# Parameter sweep (YAML config - recommended)
|
||||||
|
python benchmark.py --config configs/cutlass_numsplits.yaml
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from dataclasses import replace
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from rich.console import Console
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
from batch_spec import parse_batch_spec
|
||||||
|
from common import (
|
||||||
|
BenchmarkConfig,
|
||||||
|
BenchmarkResult,
|
||||||
|
ModelParameterSweep,
|
||||||
|
ParameterSweep,
|
||||||
|
ResultsFormatter,
|
||||||
|
batch_spec_sort_key,
|
||||||
|
is_mla_backend,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||||
|
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
||||||
|
from runner import run_attention_benchmark
|
||||||
|
|
||||||
|
return run_attention_benchmark(config)
|
||||||
|
|
||||||
|
|
||||||
|
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||||
|
"""Run MLA benchmark with appropriate backend."""
|
||||||
|
from mla_runner import run_mla_benchmark as run_mla
|
||||||
|
|
||||||
|
return run_mla(config.backend, config, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||||
|
"""
|
||||||
|
Run a single benchmark with proper backend selection.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: BenchmarkConfig with backend, batch_spec, and model params
|
||||||
|
**kwargs: Additional arguments passed to MLA benchmarks
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
BenchmarkResult (may have error field set on failure)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if is_mla_backend(config.backend):
|
||||||
|
return run_mla_benchmark(config, **kwargs)
|
||||||
|
else:
|
||||||
|
return run_standard_attention_benchmark(config)
|
||||||
|
except Exception as e:
|
||||||
|
return BenchmarkResult(
|
||||||
|
config=config,
|
||||||
|
mean_time=float("inf"),
|
||||||
|
std_time=0,
|
||||||
|
min_time=float("inf"),
|
||||||
|
max_time=float("inf"),
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_model_parameter_sweep(
|
||||||
|
backends: list[str],
|
||||||
|
batch_specs: list[str],
|
||||||
|
base_config_args: dict,
|
||||||
|
sweep: ModelParameterSweep,
|
||||||
|
console: Console,
|
||||||
|
) -> list[BenchmarkResult]:
|
||||||
|
"""
|
||||||
|
Run model parameter sweep for given backends and batch specs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
backends: List of backend names
|
||||||
|
batch_specs: List of batch specifications
|
||||||
|
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||||
|
sweep: ModelParameterSweep configuration
|
||||||
|
console: Rich console for output
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of BenchmarkResult objects
|
||||||
|
"""
|
||||||
|
all_results = []
|
||||||
|
|
||||||
|
console.print(
|
||||||
|
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
total = len(backends) * len(batch_specs) * len(sweep.values)
|
||||||
|
|
||||||
|
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||||
|
for backend in backends:
|
||||||
|
for spec in batch_specs:
|
||||||
|
for value in sweep.values:
|
||||||
|
# Create config with modified model parameter
|
||||||
|
config_args = base_config_args.copy()
|
||||||
|
config_args[sweep.param_name] = value
|
||||||
|
|
||||||
|
# Create config with original backend for running
|
||||||
|
clean_config = BenchmarkConfig(
|
||||||
|
backend=backend, batch_spec=spec, **config_args
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run benchmark
|
||||||
|
result = run_benchmark(clean_config)
|
||||||
|
|
||||||
|
# Replace backend with labeled version for display
|
||||||
|
backend_label = sweep.get_label(backend, value)
|
||||||
|
labeled_config = replace(result.config, backend=backend_label)
|
||||||
|
result = replace(result, config=labeled_config)
|
||||||
|
all_results.append(result)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
console.print(
|
||||||
|
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||||
|
f"{value}: {result.error}[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
|
# Display sweep results - create separate table for each parameter value
|
||||||
|
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
|
||||||
|
formatter = ResultsFormatter(console)
|
||||||
|
|
||||||
|
# Group results by parameter value and extract backend mapping
|
||||||
|
by_param_value = {}
|
||||||
|
backend_mapping = {} # Maps labeled backend -> original backend
|
||||||
|
|
||||||
|
for r in all_results:
|
||||||
|
# Extract original backend and param value from labeled backend
|
||||||
|
# The label format is: {backend}_{param_name}_{value}
|
||||||
|
# We need to reverse engineer this
|
||||||
|
labeled_backend = r.config.backend
|
||||||
|
|
||||||
|
# Try each backend to find which one this result belongs to
|
||||||
|
for backend in backends:
|
||||||
|
for value in sweep.values:
|
||||||
|
expected_label = sweep.get_label(backend, value)
|
||||||
|
if labeled_backend == expected_label:
|
||||||
|
backend_mapping[labeled_backend] = backend
|
||||||
|
param_value = str(value)
|
||||||
|
|
||||||
|
if param_value not in by_param_value:
|
||||||
|
by_param_value[param_value] = []
|
||||||
|
by_param_value[param_value].append(r)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Create a table for each parameter value
|
||||||
|
sorted_param_values = sorted(
|
||||||
|
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
|
||||||
|
)
|
||||||
|
|
||||||
|
for param_value in sorted_param_values:
|
||||||
|
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
|
||||||
|
param_results = by_param_value[param_value]
|
||||||
|
|
||||||
|
# Create modified results with original backend names
|
||||||
|
modified_results = []
|
||||||
|
for r in param_results:
|
||||||
|
# Get the original backend name from our mapping
|
||||||
|
original_backend = backend_mapping[r.config.backend]
|
||||||
|
modified_config = replace(r.config, backend=original_backend)
|
||||||
|
modified_result = replace(r, config=modified_config)
|
||||||
|
modified_results.append(modified_result)
|
||||||
|
|
||||||
|
# Print table with original backend names
|
||||||
|
formatter.print_table(modified_results, backends, compare_to_fastest=True)
|
||||||
|
|
||||||
|
# Show optimal backend for each (param_value, batch_spec) combination
|
||||||
|
console.print(
|
||||||
|
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Group by (param_value, batch_spec)
|
||||||
|
by_param_and_spec = {}
|
||||||
|
for r in all_results:
|
||||||
|
if r.success:
|
||||||
|
# Find which (backend, value) this result corresponds to
|
||||||
|
labeled_backend = r.config.backend
|
||||||
|
for backend in backends:
|
||||||
|
for value in sweep.values:
|
||||||
|
expected_label = sweep.get_label(backend, value)
|
||||||
|
if labeled_backend == expected_label:
|
||||||
|
param_value = str(value)
|
||||||
|
spec = r.config.batch_spec
|
||||||
|
key = (param_value, spec)
|
||||||
|
|
||||||
|
if key not in by_param_and_spec:
|
||||||
|
by_param_and_spec[key] = []
|
||||||
|
by_param_and_spec[key].append(r)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Sort by param value then spec (batch_size, q_len, kv_len)
|
||||||
|
sorted_keys = sorted(
|
||||||
|
by_param_and_spec.keys(),
|
||||||
|
key=lambda x: (
|
||||||
|
int(x[0]) if x[0].isdigit() else x[0],
|
||||||
|
batch_spec_sort_key(x[1]),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
current_param_value = None
|
||||||
|
for param_value, spec in sorted_keys:
|
||||||
|
# Print header when param value changes
|
||||||
|
if param_value != current_param_value:
|
||||||
|
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
|
||||||
|
current_param_value = param_value
|
||||||
|
|
||||||
|
results = by_param_and_spec[(param_value, spec)]
|
||||||
|
best = min(results, key=lambda r: r.mean_time)
|
||||||
|
|
||||||
|
# Extract original backend name using the mapping
|
||||||
|
backend_name = backend_mapping[best.config.backend]
|
||||||
|
|
||||||
|
# Show all backends' times for comparison
|
||||||
|
times_str = " | ".join(
|
||||||
|
[
|
||||||
|
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
|
||||||
|
for r in sorted(results, key=lambda r: r.mean_time)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
console.print(
|
||||||
|
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return all_results
|
||||||
|
|
||||||
|
|
||||||
|
def run_parameter_sweep(
|
||||||
|
backends: list[str],
|
||||||
|
batch_specs: list[str],
|
||||||
|
base_config_args: dict,
|
||||||
|
sweep: ParameterSweep,
|
||||||
|
console: Console,
|
||||||
|
) -> list[BenchmarkResult]:
|
||||||
|
"""
|
||||||
|
Run parameter sweep for given backends and batch specs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
backends: List of backend names
|
||||||
|
batch_specs: List of batch specifications
|
||||||
|
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||||
|
sweep: ParameterSweep configuration
|
||||||
|
console: Rich console for output
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of BenchmarkResult objects
|
||||||
|
"""
|
||||||
|
all_results = []
|
||||||
|
|
||||||
|
# Build list of values to sweep (including auto if requested)
|
||||||
|
sweep_values = list(sweep.values)
|
||||||
|
if sweep.include_auto:
|
||||||
|
sweep_values.append("auto")
|
||||||
|
|
||||||
|
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
|
||||||
|
|
||||||
|
total = len(backends) * len(batch_specs) * len(sweep_values)
|
||||||
|
|
||||||
|
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||||
|
for backend in backends:
|
||||||
|
for spec in batch_specs:
|
||||||
|
for value in sweep_values:
|
||||||
|
# Create config with original backend for running
|
||||||
|
config = BenchmarkConfig(
|
||||||
|
backend=backend, batch_spec=spec, **base_config_args
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare kwargs for benchmark runner
|
||||||
|
kwargs = {}
|
||||||
|
if value != "auto":
|
||||||
|
kwargs[sweep.param_name] = value
|
||||||
|
|
||||||
|
# Run benchmark
|
||||||
|
result = run_benchmark(config, **kwargs)
|
||||||
|
|
||||||
|
# Replace backend with labeled version for display
|
||||||
|
backend_label = sweep.get_label(backend, value)
|
||||||
|
labeled_config = replace(result.config, backend=backend_label)
|
||||||
|
result = replace(result, config=labeled_config)
|
||||||
|
all_results.append(result)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
console.print(
|
||||||
|
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||||
|
f"{value}: {result.error}[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
|
# Display sweep results
|
||||||
|
console.print("\n[bold green]Sweep Results:[/]")
|
||||||
|
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
|
||||||
|
formatter = ResultsFormatter(console)
|
||||||
|
formatter.print_table(all_results, backend_labels)
|
||||||
|
|
||||||
|
# Show optimal values
|
||||||
|
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
|
||||||
|
by_spec = {}
|
||||||
|
for r in all_results:
|
||||||
|
if r.success:
|
||||||
|
spec = r.config.batch_spec
|
||||||
|
if spec not in by_spec:
|
||||||
|
by_spec[spec] = []
|
||||||
|
by_spec[spec].append(r)
|
||||||
|
|
||||||
|
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
|
||||||
|
results = by_spec[spec]
|
||||||
|
best = min(results, key=lambda r: r.mean_time)
|
||||||
|
console.print(
|
||||||
|
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
|
||||||
|
)
|
||||||
|
|
||||||
|
return all_results
|
||||||
|
|
||||||
|
|
||||||
|
def load_config_from_yaml(config_path: str) -> dict:
|
||||||
|
"""Load configuration from YAML file."""
|
||||||
|
with open(config_path) as f:
|
||||||
|
return yaml.safe_load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
|
||||||
|
"""
|
||||||
|
Generate batch specs from range specifications.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ranges: List of range specifications, each containing:
|
||||||
|
- template: Batch spec template (e.g., "q{q_len}kv1k")
|
||||||
|
- q_len: Dict with start, stop, step, end_inclusive (optional)
|
||||||
|
- Other parameters can also be ranges
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of generated batch spec strings
|
||||||
|
|
||||||
|
Example:
|
||||||
|
ranges = [
|
||||||
|
{
|
||||||
|
"template": "q{q_len}kv1k",
|
||||||
|
"q_len": {
|
||||||
|
"start": 1,
|
||||||
|
"stop": 16,
|
||||||
|
"step": 1,
|
||||||
|
"end_inclusive": true # Optional, defaults to true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
|
||||||
|
"""
|
||||||
|
all_specs = []
|
||||||
|
|
||||||
|
for range_spec in ranges:
|
||||||
|
template = range_spec.get("template")
|
||||||
|
if not template:
|
||||||
|
raise ValueError("Range specification must include 'template'")
|
||||||
|
|
||||||
|
# Extract all range parameters from the spec
|
||||||
|
range_params = {}
|
||||||
|
for key, value in range_spec.items():
|
||||||
|
if key == "template":
|
||||||
|
continue
|
||||||
|
if isinstance(value, dict) and "start" in value:
|
||||||
|
# This is a range specification
|
||||||
|
start = value["start"]
|
||||||
|
stop = value["stop"]
|
||||||
|
step = value.get("step", 1)
|
||||||
|
# Check if end should be inclusive (default: True)
|
||||||
|
end_inclusive = value.get("end_inclusive", True)
|
||||||
|
|
||||||
|
# Adjust stop based on end_inclusive
|
||||||
|
if end_inclusive:
|
||||||
|
range_params[key] = list(range(start, stop + 1, step))
|
||||||
|
else:
|
||||||
|
range_params[key] = list(range(start, stop, step))
|
||||||
|
else:
|
||||||
|
# This is a fixed value
|
||||||
|
range_params[key] = [value]
|
||||||
|
|
||||||
|
# Generate all combinations (Cartesian product)
|
||||||
|
if range_params:
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
param_names = list(range_params.keys())
|
||||||
|
param_values = [range_params[name] for name in param_names]
|
||||||
|
|
||||||
|
for values in itertools.product(*param_values):
|
||||||
|
params = dict(zip(param_names, values))
|
||||||
|
spec = template.format(**params)
|
||||||
|
all_specs.append(spec)
|
||||||
|
else:
|
||||||
|
# No parameters, just use template as-is
|
||||||
|
all_specs.append(template)
|
||||||
|
|
||||||
|
return all_specs
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point for the universal vLLM attention benchmark.

    Parses command-line arguments, optionally overlays a YAML config
    (CLI-provided backends take precedence), then dispatches to one of
    four modes:
      1. "decode_vs_prefill": batched comparison of decode vs prefill
         pipelines per query length, reporting the crossover threshold.
      2. Model parameter sweep (via ``run_model_parameter_sweep``).
      3. Backend parameter sweep (via ``run_parameter_sweep``).
      4. Normal mode: one benchmark per (batch spec, backend) pair.
    Results are printed as a table and optionally saved to CSV/JSON.
    """
    parser = argparse.ArgumentParser(
        description="Universal vLLM attention benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Config file
    parser.add_argument(
        "--config",
        help="Path to YAML config file (overrides other args)",
    )

    # Backend selection
    parser.add_argument(
        "--backends",
        nargs="+",
        help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
        "flashinfer_mla, flashattn_mla, flashmla)",
    )
    parser.add_argument(
        "--backend",
        help="Single backend (alternative to --backends)",
    )

    # Batch specifications
    parser.add_argument(
        "--batch-specs",
        nargs="+",
        default=["q2k", "8q1s1k"],
        help="Batch specifications using extended grammar",
    )

    # Model config
    parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
    parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
    parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
    parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
    parser.add_argument("--block-size", type=int, default=16, help="Block size")

    # Benchmark settings
    parser.add_argument("--device", default="cuda:0", help="Device")
    parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
    parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
    parser.add_argument("--profile-memory", action="store_true", help="Profile memory")

    # Parameter sweep (use YAML config for advanced sweeps)
    parser.add_argument(
        "--sweep-param",
        help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
    )
    parser.add_argument(
        "--sweep-values",
        type=int,
        nargs="+",
        help="Values to sweep for the parameter",
    )

    # Output
    parser.add_argument("--output-csv", help="Save to CSV")
    parser.add_argument("--output-json", help="Save to JSON")

    args = parser.parse_args()

    console = Console()
    console.print("[bold cyan]vLLM Attention Benchmark[/]")

    # Load config from YAML if provided
    if args.config:
        console.print(f"[yellow]Loading config from: {args.config}[/]")
        yaml_config = load_config_from_yaml(args.config)

        # Show description if available
        if "description" in yaml_config:
            console.print(f"[dim]{yaml_config['description']}[/]")

        # Override args with YAML values, but CLI args take precedence
        # Check if CLI provided backends (they would be non-None and not default)
        cli_backends_provided = args.backends is not None or args.backend is not None

        # Backend(s) - only use YAML if CLI didn't specify
        if not cli_backends_provided:
            if "backend" in yaml_config:
                args.backend = yaml_config["backend"]
                args.backends = None
            elif "backends" in yaml_config:
                args.backends = yaml_config["backends"]
                args.backend = None

        # Check for special modes
        if "mode" in yaml_config:
            args.mode = yaml_config["mode"]
        else:
            args.mode = None

        # Batch specs and sizes
        # Support both explicit batch_specs and generated batch_spec_ranges
        if "batch_spec_ranges" in yaml_config:
            # Generate batch specs from ranges
            generated_specs = generate_batch_specs_from_ranges(
                yaml_config["batch_spec_ranges"]
            )
            # Combine with any explicit batch_specs
            if "batch_specs" in yaml_config:
                args.batch_specs = yaml_config["batch_specs"] + generated_specs
            else:
                args.batch_specs = generated_specs
            console.print(
                f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
            )
        elif "batch_specs" in yaml_config:
            args.batch_specs = yaml_config["batch_specs"]

        # NOTE(review): when the YAML has no "batch_sizes" key this is set to
        # None, and the decode_vs_prefill branch below iterates it directly —
        # confirm configs using that mode always supply batch_sizes.
        if "batch_sizes" in yaml_config:
            args.batch_sizes = yaml_config["batch_sizes"]
        else:
            args.batch_sizes = None

        # Model config
        if "model" in yaml_config:
            model = yaml_config["model"]
            args.num_layers = model.get("num_layers", args.num_layers)
            args.head_dim = model.get("head_dim", args.head_dim)
            args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
            args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
            args.block_size = model.get("block_size", args.block_size)

        # Benchmark settings (top-level keys)
        if "device" in yaml_config:
            args.device = yaml_config["device"]
        if "repeats" in yaml_config:
            args.repeats = yaml_config["repeats"]
        if "warmup_iters" in yaml_config:
            args.warmup_iters = yaml_config["warmup_iters"]
        if "profile_memory" in yaml_config:
            args.profile_memory = yaml_config["profile_memory"]

        # Parameter sweep configuration
        if "parameter_sweep" in yaml_config:
            sweep_config = yaml_config["parameter_sweep"]
            args.parameter_sweep = ParameterSweep(
                param_name=sweep_config["param_name"],
                values=sweep_config["values"],
                include_auto=sweep_config.get("include_auto", False),
                label_format=sweep_config.get(
                    "label_format", "{backend}_{param_name}_{value}"
                ),
            )
        else:
            args.parameter_sweep = None

        # Model parameter sweep configuration
        if "model_parameter_sweep" in yaml_config:
            sweep_config = yaml_config["model_parameter_sweep"]
            args.model_parameter_sweep = ModelParameterSweep(
                param_name=sweep_config["param_name"],
                values=sweep_config["values"],
                label_format=sweep_config.get(
                    "label_format", "{backend}_{param_name}_{value}"
                ),
            )
        else:
            args.model_parameter_sweep = None

        # Output paths from YAML only fill in what the CLI left unset
        if "output" in yaml_config:
            output = yaml_config["output"]
            if "csv" in output and not args.output_csv:
                args.output_csv = output["csv"]
            if "json" in output and not args.output_json:
                args.output_json = output["json"]

    console.print()

    # Handle CLI-based parameter sweep (if not from YAML)
    if (
        (not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
        and args.sweep_param
        and args.sweep_values
    ):
        args.parameter_sweep = ParameterSweep(
            param_name=args.sweep_param,
            values=args.sweep_values,
            include_auto=False,
            label_format="{backend}_{param_name}_{value}",
        )

    # Determine backends: --backends wins, then --backend, then "flash"
    backends = args.backends or ([args.backend] if args.backend else ["flash"])
    console.print(f"Backends: {', '.join(backends)}")
    console.print(f"Batch specs: {', '.join(args.batch_specs)}")
    console.print()

    # Run benchmarks
    all_results = []

    # Handle special mode: decode_vs_prefill comparison
    if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
        console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
        console.print(
            "[dim]For each query length, testing both decode and prefill pipelines[/]"
        )
        console.print("[dim]Using batched execution for optimal performance[/]")

        # Extract batch sizes from config
        batch_sizes = getattr(args, "batch_sizes", [1])
        backend = backends[0]  # Use first backend (should only be one)

        # Calculate total benchmarks
        total = len(batch_sizes)

        with tqdm(total=total, desc="Benchmarking") as pbar:
            for batch_size in batch_sizes:
                # Prepare all configs for this batch size
                configs_with_thresholds = []

                for spec in args.batch_specs:
                    # Parse the batch spec to get query length
                    requests = parse_batch_spec(spec)
                    if not requests:
                        console.print(
                            f"[red]Error: Could not parse batch spec '{spec}'[/]"
                        )
                        continue

                    # Get query length from first request
                    query_length = requests[0].q_len

                    # Create batch spec for this batch size
                    # For batch_size > 1, we need to prepend the count
                    batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec

                    # Create base config (without backend name)
                    base_config = BenchmarkConfig(
                        backend=backend,  # Will be overridden later
                        batch_spec=batch_spec,
                        num_layers=args.num_layers,
                        head_dim=args.head_dim,
                        num_q_heads=args.num_q_heads,
                        num_kv_heads=args.num_kv_heads,
                        block_size=args.block_size,
                        device=args.device,
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
                    )

                    # Add decode pipeline config: threshold == qlen forces
                    # the decode path for this query length
                    decode_threshold = query_length
                    config_decode = replace(
                        base_config,
                        backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
                    )
                    configs_with_thresholds.append((config_decode, decode_threshold))

                    # Add prefill pipeline config if query_length > 1
                    # (threshold == qlen-1 forces the prefill path)
                    if query_length > 1:
                        prefill_threshold = query_length - 1
                        config_prefill = replace(
                            base_config,
                            backend=f"{backend}_prefill_qlen{query_length}"
                            f"_bs{batch_size}",
                        )
                        configs_with_thresholds.append(
                            (config_prefill, prefill_threshold)
                        )

                # Run all benchmarks for this batch size in one go (batched mode)
                try:
                    from mla_runner import run_mla_benchmark as run_mla

                    # Use batched API: pass list of (config, threshold) tuples
                    timing_results = run_mla(backend, configs_with_thresholds)

                    # Create BenchmarkResult objects from timing results
                    for (config, _), timing in zip(
                        configs_with_thresholds, timing_results
                    ):
                        result = BenchmarkResult(
                            config=config,
                            mean_time=timing["mean"],
                            std_time=timing["std"],
                            min_time=timing["min"],
                            max_time=timing["max"],
                            throughput_tokens_per_sec=timing.get("throughput", None),
                        )
                        all_results.append(result)

                except Exception as e:
                    import traceback

                    console.print(
                        f"[red]Error running batched benchmarks for "
                        f"batch_size={batch_size}: {e}[/]"
                    )
                    console.print("[red]Traceback:[/]")
                    traceback.print_exc()
                    # Add error results for all configs
                    for config, _ in configs_with_thresholds:
                        result = BenchmarkResult(
                            config=config,
                            mean_time=float("inf"),
                            std_time=0,
                            min_time=float("inf"),
                            max_time=float("inf"),
                            error=str(e),
                        )
                        all_results.append(result)

                pbar.update(1)

        # Display decode vs prefill results
        console.print("\n[bold green]Decode vs Prefill Results:[/]")

        # Group by batch size (batch size is parsed back out of the
        # "..._bsN" suffix encoded into the backend label above)
        by_batch_size = {}
        for r in all_results:
            if r.success:
                # Extract batch size from backend name
                parts = r.config.backend.split("_")
                bs_part = [p for p in parts if p.startswith("bs")]
                if bs_part:
                    bs = int(bs_part[0][2:])
                    if bs not in by_batch_size:
                        by_batch_size[bs] = []
                    by_batch_size[bs].append(r)

        # For each batch size, analyze crossover point
        for bs in sorted(by_batch_size.keys()):
            console.print(f"\n[bold cyan]Batch size: {bs}[/]")
            results = by_batch_size[bs]

            # Group by query length (parsed from the "qlenN" label part)
            by_qlen = {}
            for r in results:
                parts = r.config.backend.split("_")
                qlen_part = [p for p in parts if p.startswith("qlen")]
                if qlen_part:
                    qlen = int(qlen_part[0][4:])
                    if qlen not in by_qlen:
                        by_qlen[qlen] = {}

                    pipeline = "decode" if "decode" in r.config.backend else "prefill"
                    by_qlen[qlen][pipeline] = r

            # Find crossover point: the largest qlen at which decode
            # was still faster than prefill
            last_decode_faster = None
            for qlen in sorted(by_qlen.keys()):
                pipelines = by_qlen[qlen]
                if "decode" in pipelines and "prefill" in pipelines:
                    decode_time = pipelines["decode"].mean_time
                    prefill_time = pipelines["prefill"].mean_time
                    faster = "decode" if decode_time < prefill_time else "prefill"

                    # Speedup of the faster pipeline over the slower one
                    speedup = (
                        prefill_time / decode_time
                        if decode_time < prefill_time
                        else decode_time / prefill_time
                    )

                    console.print(
                        f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
                        f"prefill={prefill_time:.6f}s -> "
                        f"[bold]{faster}[/] ({speedup:.2f}x)"
                    )

                    if faster == "decode":
                        last_decode_faster = qlen

            if last_decode_faster is not None:
                optimal_threshold = last_decode_faster
                console.print(
                    f"\n [bold green]Optimal threshold for batch_size={bs}: "
                    f"{optimal_threshold}[/]"
                )
                console.print(
                    f" [dim](Use decode pipeline for query_length <= "
                    f"{optimal_threshold})[/]"
                )
            else:
                console.print(
                    f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
                )

    # Handle model parameter sweep mode
    elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
        # Model parameter sweep
        base_config_args = {
            "num_layers": args.num_layers,
            "head_dim": args.head_dim,
            "num_q_heads": args.num_q_heads,
            "num_kv_heads": args.num_kv_heads,
            "block_size": args.block_size,
            "device": args.device,
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
        }
        all_results = run_model_parameter_sweep(
            backends,
            args.batch_specs,
            base_config_args,
            args.model_parameter_sweep,
            console,
        )

    # Handle parameter sweep mode (unified)
    elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
        # Unified parameter sweep
        base_config_args = {
            "num_layers": args.num_layers,
            "head_dim": args.head_dim,
            "num_q_heads": args.num_q_heads,
            "num_kv_heads": args.num_kv_heads,
            "block_size": args.block_size,
            "device": args.device,
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
        }
        all_results = run_parameter_sweep(
            backends, args.batch_specs, base_config_args, args.parameter_sweep, console
        )

    else:
        # Normal mode: compare backends
        total = len(backends) * len(args.batch_specs)

        with tqdm(total=total, desc="Benchmarking") as pbar:
            for spec in args.batch_specs:
                for backend in backends:
                    config = BenchmarkConfig(
                        backend=backend,
                        batch_spec=spec,
                        num_layers=args.num_layers,
                        head_dim=args.head_dim,
                        num_q_heads=args.num_q_heads,
                        num_kv_heads=args.num_kv_heads,
                        block_size=args.block_size,
                        device=args.device,
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
                    )

                    result = run_benchmark(config)
                    all_results.append(result)

                    if not result.success:
                        console.print(f"[red]Error {backend} {spec}: {result.error}[/]")

                    pbar.update(1)

    # Display results
    console.print("\n[bold green]Results:[/]")
    formatter = ResultsFormatter(console)
    formatter.print_table(all_results, backends)

    # Save results
    if all_results:
        formatter = ResultsFormatter(console)
        if args.output_csv:
            formatter.save_csv(all_results, args.output_csv)
        if args.output_json:
            formatter.save_json(all_results, args.output_json)
||||||
|
|
||||||
|
# Run the benchmark CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
475
benchmarks/attention_benchmarks/common.py
Normal file
475
benchmarks/attention_benchmarks/common.py
Normal file
@@ -0,0 +1,475 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""Common utilities for attention benchmarking."""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from batch_spec import get_batch_type, parse_batch_spec
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
|
||||||
|
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
    """Sorting key for a batch spec string: (batch_size, max_q_len, max_kv_len).

    Ordering by this key sorts results by batch size first, then query
    length, then KV sequence length, rather than alphabetically by the
    raw spec string.
    """
    try:
        parsed = parse_batch_spec(spec)
        q_lens = [req.q_len for req in parsed]
        kv_lens = [req.kv_len for req in parsed]
        return (len(parsed), max(q_lens, default=0), max(kv_lens, default=0))
    except Exception:
        # Unparseable specs all collapse to the same lowest key.
        return (0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
# Mock classes for vLLM attention infrastructure
|
||||||
|
|
||||||
|
|
||||||
|
class MockHfConfig:
|
||||||
|
"""Mock HuggingFace config that satisfies vLLM's requirements."""
|
||||||
|
|
||||||
|
def __init__(self, mla_dims: dict, index_topk: int | None = None):
|
||||||
|
self.num_attention_heads = mla_dims["num_q_heads"]
|
||||||
|
self.num_key_value_heads = mla_dims["num_kv_heads"]
|
||||||
|
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
|
||||||
|
self.model_type = "deepseek_v2"
|
||||||
|
self.is_encoder_decoder = False
|
||||||
|
self.kv_lora_rank = mla_dims["kv_lora_rank"]
|
||||||
|
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
|
||||||
|
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
|
||||||
|
self.v_head_dim = mla_dims["v_head_dim"]
|
||||||
|
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
|
||||||
|
if index_topk is not None:
|
||||||
|
self.index_topk = index_topk
|
||||||
|
|
||||||
|
def get_text_config(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
# Import AttentionLayerBase at module level to avoid circular dependencies
|
||||||
|
try:
|
||||||
|
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||||
|
except ImportError:
|
||||||
|
AttentionLayerBase = object # Fallback
|
||||||
|
|
||||||
|
|
||||||
|
class MockKVBProj:
    """Mock KV projection layer for MLA prefill mode.

    Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends:
    maps kv_c_normed to [qk_nope_head_dim + v_head_dim] per head. The
    output is random data of the projected shape (no real weights), which
    is sufficient for timing benchmarks.
    """

    def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
        self.num_heads = num_heads
        self.qk_nope_head_dim = qk_nope_head_dim
        self.v_head_dim = v_head_dim
        # Per-head output width.
        self.out_dim = qk_nope_head_dim + v_head_dim

    def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
        """Produce a mock projection of kv_c_normed.

        Args:
            x: Input tensor [num_tokens, kv_lora_rank]

        Returns:
            One-element tuple (matching the ColumnParallelLinear API)
            holding a tensor of shape
            [num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
            on x's device with x's dtype.
        """
        out_shape = (x.shape[0], self.num_heads, self.out_dim)
        projected = torch.randn(*out_shape, device=x.device, dtype=x.dtype)
        return (projected,)
|
||||||
|
|
||||||
|
|
||||||
|
class MockIndexer:
    """Mock Indexer for sparse MLA backends.

    Exposes the topk_indices_buffer that sparse MLA backends read to
    decide which KV cache slots each token attends to.
    """

    def __init__(
        self,
        max_num_tokens: int,
        topk_tokens: int,
        device: torch.device,
    ):
        self.topk_tokens = topk_tokens
        # One row of top-k slot indices per token, zero-initialized.
        self.topk_indices_buffer = torch.zeros(
            (max_num_tokens, topk_tokens),
            dtype=torch.int32,
            device=device,
        )

    def fill_random_indices(self, num_tokens: int, max_kv_len: int):
        """Fill topk_indices_buffer with random valid indices for benchmarking."""
        buffer = self.topk_indices_buffer
        # Uniform draws in [0, max_kv_len) keep every index a valid KV slot.
        buffer[:num_tokens] = torch.randint(
            0,
            max_kv_len,
            (num_tokens, self.topk_tokens),
            dtype=torch.int32,
            device=buffer.device,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class MockLayer(AttentionLayerBase):
    """Mock attention layer exposing scale parameters and an impl handle.

    Subclasses AttentionLayerBase so it passes isinstance checks in
    get_layers_from_vllm_config when FlashInfer prefill is enabled.
    """

    def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
        # AttentionLayerBase defines no __init__, so super() is not called.
        # Each scale is its own unit tensor, plus a plain-float mirror for
        # kernels that require scalars.
        for attr in ("_q_scale", "_k_scale", "_v_scale"):
            unit = torch.tensor(1.0, device=device)
            setattr(self, attr, unit)
            setattr(self, attr + "_float", float(unit.item()))
        # AttentionImpl for metadata builders to query.
        self.impl = impl
        # KV cache spec returned by get_kv_cache_spec.
        self._kv_cache_spec = kv_cache_spec

    def get_attn_backend(self):
        """Get the attention backend class (required by AttentionLayerBase).

        Always None: this mock carries no real backend.
        """
        return None

    def get_kv_cache_spec(self):
        """Get the KV cache spec (required by AttentionLayerBase)."""
        return self._kv_cache_spec
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ParameterSweep:
    """Configuration for sweeping a backend parameter."""

    # Backend parameter to vary, e.g. "num_kv_splits".
    param_name: str
    # Candidate values to benchmark.
    values: list[Any]
    # When true, also benchmark with the parameter left unset (auto mode).
    include_auto: bool = False
    # Template used to label each (backend, value) result.
    label_format: str = "{backend}_{param_name}_{value}"

    def get_label(self, backend: str, value: Any) -> str:
        """Render the result label for one swept value."""
        fields = {
            "backend": backend,
            "param_name": self.param_name,
            "value": value,
        }
        return self.label_format.format(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ModelParameterSweep:
    """Configuration for sweeping a model configuration parameter."""

    # Model config field to vary, e.g. "num_q_heads".
    param_name: str
    # Candidate values to benchmark.
    values: list[Any]
    # Template used to label each (backend, value) result.
    label_format: str = "{backend}_{param_name}_{value}"

    def get_label(self, backend: str, value: Any) -> str:
        """Render the result label for one swept value."""
        fields = {
            "backend": backend,
            "param_name": self.param_name,
            "value": value,
        }
        return self.label_format.format(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BenchmarkConfig:
    """Configuration for a single benchmark run."""

    backend: str  # attention backend identifier (may carry a label suffix)
    batch_spec: str  # batch specification string, e.g. "q2k" or "8q1s1k"
    num_layers: int  # number of attention layers to simulate
    head_dim: int  # per-head dimension
    num_q_heads: int  # number of query heads
    num_kv_heads: int  # number of key/value heads
    block_size: int  # KV cache block size
    device: str  # torch device string, e.g. "cuda:0"
    dtype: torch.dtype = torch.float16  # compute dtype
    repeats: int = 1  # timed repetitions
    warmup_iters: int = 3  # warmup iterations before timing
    profile_memory: bool = False  # record memory stats when true
    use_cuda_graphs: bool = False  # capture/replay with CUDA graphs

    # MLA-specific
    kv_lora_rank: int | None = None
    qk_nope_head_dim: int | None = None
    qk_rope_head_dim: int | None = None
    v_head_dim: int | None = None

    # Backend-specific tuning
    num_kv_splits: int | None = None  # CUTLASS MLA
    reorder_batch_threshold: int | None = None  # FlashAttn MLA, FlashMLA
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BenchmarkResult:
    """Results from a single benchmark run."""

    config: BenchmarkConfig  # configuration that produced this result
    mean_time: float  # seconds
    std_time: float  # seconds
    min_time: float  # seconds
    max_time: float  # seconds
    throughput_tokens_per_sec: float | None = None
    memory_allocated_mb: float | None = None
    memory_reserved_mb: float | None = None
    error: str | None = None  # error message; None means success

    @property
    def success(self) -> bool:
        """Whether benchmark completed successfully."""
        return self.error is None

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization (e.g. JSON output)."""
        return {
            # Nested config is flattened to plain dicts via dataclasses.asdict.
            "config": asdict(self.config),
            "mean_time": self.mean_time,
            "std_time": self.std_time,
            "min_time": self.min_time,
            "max_time": self.max_time,
            "throughput_tokens_per_sec": self.throughput_tokens_per_sec,
            "memory_allocated_mb": self.memory_allocated_mb,
            "memory_reserved_mb": self.memory_reserved_mb,
            "error": self.error,
        }
|
||||||
|
|
||||||
|
|
||||||
|
class ResultsFormatter:
|
||||||
|
"""Format and display benchmark results."""
|
||||||
|
|
||||||
|
def __init__(self, console: Console | None = None):
|
||||||
|
self.console = console or Console()
|
||||||
|
|
||||||
|
def print_table(
|
||||||
|
self,
|
||||||
|
results: list[BenchmarkResult],
|
||||||
|
backends: list[str],
|
||||||
|
compare_to_fastest: bool = True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Print results as a rich table.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
results: List of BenchmarkResult
|
||||||
|
backends: List of backend names being compared
|
||||||
|
compare_to_fastest: Show percentage comparison to fastest
|
||||||
|
"""
|
||||||
|
# Group by batch spec, preserving first-occurrence order
|
||||||
|
by_spec = {}
|
||||||
|
specs_order = []
|
||||||
|
for r in results:
|
||||||
|
spec = r.config.batch_spec
|
||||||
|
if spec not in by_spec:
|
||||||
|
by_spec[spec] = {}
|
||||||
|
specs_order.append(spec)
|
||||||
|
by_spec[spec][r.config.backend] = r
|
||||||
|
|
||||||
|
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
|
||||||
|
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
|
||||||
|
|
||||||
|
# Create shortened backend names for display
|
||||||
|
def shorten_backend_name(name: str) -> str:
|
||||||
|
"""Shorten long backend names for table display."""
|
||||||
|
# Remove common prefixes
|
||||||
|
name = name.replace("flashattn_mla", "famla")
|
||||||
|
name = name.replace("flashinfer_mla", "fimla")
|
||||||
|
name = name.replace("flashmla", "fmla")
|
||||||
|
name = name.replace("cutlass_mla", "cmla")
|
||||||
|
name = name.replace("numsplits", "ns")
|
||||||
|
return name
|
||||||
|
|
||||||
|
table = Table(title="Attention Benchmark Results")
|
||||||
|
table.add_column("Batch\nSpec", no_wrap=True)
|
||||||
|
table.add_column("Type", no_wrap=True)
|
||||||
|
table.add_column("Batch\nSize", justify="right", no_wrap=True)
|
||||||
|
|
||||||
|
multi = len(backends) > 1
|
||||||
|
for backend in backends:
|
||||||
|
short_name = shorten_backend_name(backend)
|
||||||
|
# Time column
|
||||||
|
col_time = f"{short_name}\nTime (s)"
|
||||||
|
table.add_column(col_time, justify="right", no_wrap=False)
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
# Relative performance column
|
||||||
|
col_rel = f"{short_name}\nvs Best"
|
||||||
|
table.add_column(col_rel, justify="right", no_wrap=False)
|
||||||
|
|
||||||
|
# Add rows
|
||||||
|
for spec in specs_order:
|
||||||
|
spec_results = by_spec[spec]
|
||||||
|
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
|
||||||
|
best_time = min(times.values()) if times else 0.0
|
||||||
|
|
||||||
|
batch_type = get_batch_type(spec)
|
||||||
|
batch_size = len(parse_batch_spec(spec))
|
||||||
|
row = [spec, batch_type, str(batch_size)]
|
||||||
|
for backend in backends:
|
||||||
|
if backend in spec_results:
|
||||||
|
r = spec_results[backend]
|
||||||
|
if r.success:
|
||||||
|
row.append(f"{r.mean_time:.6f}")
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
pct = (
|
||||||
|
(r.mean_time / best_time * 100) if best_time > 0 else 0
|
||||||
|
)
|
||||||
|
pct_str = f"{pct:.1f}%"
|
||||||
|
if r.mean_time == best_time:
|
||||||
|
pct_str = f"[bold green]{pct_str}[/]"
|
||||||
|
row.append(pct_str)
|
||||||
|
else:
|
||||||
|
row.append("[red]ERROR[/]")
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
row.append("-")
|
||||||
|
else:
|
||||||
|
row.append("-")
|
||||||
|
if multi and compare_to_fastest:
|
||||||
|
row.append("-")
|
||||||
|
|
||||||
|
table.add_row(*row)
|
||||||
|
|
||||||
|
self.console.print(table)
|
||||||
|
|
||||||
|
def save_csv(self, results: list[BenchmarkResult], path: str):
    """Save benchmark results to a CSV file.

    Creates parent directories as needed. Nothing is written when
    ``results`` is empty.

    Args:
        results: Benchmark results to serialize; one CSV row per result.
        path: Destination file path.
    """
    if not results:
        return

    path_obj = Path(path)
    path_obj.parent.mkdir(parents=True, exist_ok=True)

    # newline="" lets the csv module control line endings; an explicit
    # utf-8 encoding keeps output independent of the platform locale.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "backend",
                "batch_spec",
                "num_layers",
                "mean_time",
                "std_time",
                "throughput",
                "memory_mb",
            ],
        )
        writer.writeheader()
        for r in results:
            writer.writerow(
                {
                    "backend": r.config.backend,
                    "batch_spec": r.config.batch_spec,
                    "num_layers": r.config.num_layers,
                    "mean_time": r.mean_time,
                    "std_time": r.std_time,
                    # Optional metrics fall back to 0 when unavailable.
                    "throughput": r.throughput_tokens_per_sec or 0,
                    "memory_mb": r.memory_allocated_mb or 0,
                }
            )

    self.console.print(f"[green]Saved CSV results to {path}[/]")
|
||||||
|
|
||||||
|
def save_json(self, results: list[BenchmarkResult], path: str):
    """Save benchmark results to a JSON file.

    Creates parent directories as needed. An empty ``results`` list
    writes ``[]`` (unlike ``save_csv``, which writes nothing).

    Args:
        results: Benchmark results; each is serialized via ``to_dict()``.
        path: Destination file path.
    """
    path_obj = Path(path)
    path_obj.parent.mkdir(parents=True, exist_ok=True)

    data = [r.to_dict() for r in results]
    # Explicit utf-8 keeps the JSON file independent of the platform
    # locale encoding; default=str stringifies non-serializable values
    # (e.g. datetimes) instead of raising.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, default=str)

    self.console.print(f"[green]Saved JSON results to {path}[/]")
|
||||||
|
|
||||||
|
|
||||||
|
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
    """
    Get MLA dimensions for known models.

    Args:
        model_name: Model identifier

    Returns:
        Dict with MLA dimension configuration

    Raises:
        ValueError: If ``model_name`` is not a known model.
    """
    # Every known checkpoint shares the same latent/rope geometry; the
    # variants differ only in the number of query heads.
    q_heads_by_model = {
        "deepseek-v2": 128,
        "deepseek-v3": 128,
        "deepseek-v2-lite": 16,
    }

    if model_name not in q_heads_by_model:
        raise ValueError(
            f"Unknown model '{model_name}'. "
            f"Known models: {list(q_heads_by_model.keys())}"
        )

    # Key order matches the historical per-model config layout.
    return {
        "kv_lora_rank": 512,
        "qk_nope_head_dim": 128,
        "qk_rope_head_dim": 64,
        "v_head_dim": 128,
        "num_q_heads": q_heads_by_model[model_name],
        "num_kv_heads": 1,
        "head_dim": 576,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_attention_scale(head_dim: int) -> float:
    """Compute attention scale factor (1/sqrt(d))."""
    # ``x ** 0.5`` is the correctly-rounded square root, identical to
    # math.sqrt(x) for non-negative x.
    root = head_dim ** 0.5
    return 1.0 / root
|
||||||
|
|
||||||
|
|
||||||
|
def is_mla_backend(backend: str) -> bool:
    """
    Check if backend is an MLA backend using the AttentionBackendEnum.

    Args:
        backend: Backend name matching AttentionBackendEnum exactly
            (e.g., "FLASHMLA_SPARSE")

    Returns:
        True if the backend is an MLA backend, False otherwise. Unknown
        names, backends whose class cannot be resolved, and environments
        where vLLM itself cannot be imported all yield False.
    """
    try:
        # The import lives inside the try-block: the except clause below
        # already lists ImportError, but with the import outside the try
        # a missing/broken vllm installation raised instead of returning
        # False as documented.
        from vllm.v1.attention.backends.registry import AttentionBackendEnum

        backend_enum = AttentionBackendEnum[backend]
        backend_class = backend_enum.get_class()
        return backend_class.is_mla()
    except (KeyError, ValueError, ImportError, AttributeError):
        return False
|
||||||
70
benchmarks/attention_benchmarks/configs/mla_decode.yaml
Normal file
70
benchmarks/attention_benchmarks/configs/mla_decode.yaml
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
---
# MLA decode-only benchmark configuration

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128  # Base value, can be swept for TP simulation
  num_kv_heads: 1  # MLA uses single latent KV
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128

# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"

batch_specs:
  # Small batches, varying sequence lengths
  - "16q1s512"  # 16 requests, 512 KV cache
  - "16q1s1k"  # 16 requests, 1k KV cache
  - "16q1s2k"  # 16 requests, 2k KV cache
  - "16q1s4k"  # 16 requests, 4k KV cache

  # Medium batches
  - "32q1s1k"  # 32 requests, 1k KV cache
  - "32q1s2k"  # 32 requests, 2k KV cache
  - "32q1s4k"  # 32 requests, 4k KV cache
  - "32q1s8k"  # 32 requests, 8k KV cache

  # Large batches
  - "64q1s1k"  # 64 requests, 1k KV cache
  - "64q1s2k"  # 64 requests, 2k KV cache
  - "64q1s4k"  # 64 requests, 4k KV cache
  - "64q1s8k"  # 64 requests, 8k KV cache

  # Very large batches
  - "128q1s1k"  # 128 requests, 1k KV cache
  - "128q1s2k"  # 128 requests, 2k KV cache
  - "128q1s4k"  # 128 requests, 4k KV cache
  - "128q1s8k"  # 128 requests, 8k KV cache

  # Long context
  - "32q1s16k"  # 32 requests, 16k KV cache
  - "32q1s32k"  # 32 requests, 32k KV cache

backends:
  - CUTLASS_MLA
  - FLASHINFER_MLA
  - FLASH_ATTN_MLA  # Hopper only
  - FLASHMLA  # Hopper only

device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true

# Backend-specific tuning
CUTLASS_MLA:
  num_kv_splits: auto  # or specific value like 4, 8, 16

FLASH_ATTN_MLA:
  reorder_batch_threshold: 512

FLASHMLA:
  reorder_batch_threshold: 1
|
||||||
60
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
Normal file
60
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
---
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128
  num_kv_heads: 1
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128

batch_specs:
  # Small prefill + decode
  - "1q1k_8q1s1k"  # 1 prefill + 8 decode
  - "2q2k_16q1s1k"  # 2 prefill + 16 decode
  - "4q1k_32q1s2k"  # 4 prefill + 32 decode

  # Medium prefill + decode
  - "2q4k_32q1s2k"  # 2 medium prefill + 32 decode
  - "4q4k_64q1s2k"  # 4 medium prefill + 64 decode
  - "8q2k_64q1s4k"  # 8 prefill + 64 decode

  # Large prefill + decode (chunked prefill stress test)
  - "2q8k_32q1s1k"  # 2 large prefill + 32 decode
  - "1q16k_16q1s2k"  # 1 very large prefill + 16 decode
  - "2q16k_32q1s4k"  # 2 very large prefill + 32 decode

  # Context extension + decode
  - "2q1kkv2k_16q1s1k"  # 2 extend + 16 decode
  - "4q2kkv4k_32q1s2k"  # 4 extend + 32 decode
  - "2q1kkv8k_32q1s2k"  # 2 large extend + 32 decode

  # Explicitly chunked prefill
  - "q8k"  # 8k prefill with chunking hint
  - "q16k"  # 16k prefill with chunking hint
  - "2q8k_32q1s2k"  # 2 chunked prefill + 32 decode

  # High decode ratio (realistic serving)
  - "1q2k_63q1s1k"  # 1 prefill + 63 decode
  - "2q2k_62q1s2k"  # 2 prefill + 62 decode
  - "4q4k_60q1s4k"  # 4 prefill + 60 decode

backends:
  - CUTLASS_MLA
  - FLASHINFER_MLA
  - FLASH_ATTN_MLA  # Hopper only
  - FLASHMLA  # Hopper only

device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true

# Analyze chunked prefill workspace size impact
chunked_prefill:
  test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
|
||||||
62
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
Normal file
62
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
---
# MLA prefill-only benchmark configuration for sparse backends

model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128
  num_kv_heads: 1
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128

# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"

batch_specs:
  # Pure prefill
  - "1q512"
  - "1q1k"
  - "1q2k"
  - "1q4k"
  - "1q8k"

  # Batched pure prefill
  - "2q512"
  - "2q1k"
  - "2q2k"
  - "2q4k"
  - "2q8k"
  - "4q512"
  - "4q1k"
  - "4q2k"
  - "4q4k"
  - "4q8k"
  - "8q512"
  - "8q1k"
  - "8q2k"
  - "8q4k"
  - "8q8k"

  # Extend
  - "1q512s4k"
  - "1q512s8k"
  - "1q1ks8k"
  - "1q2ks8k"
  - "1q2ks16k"
  - "1q4ks16k"

backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE

device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user