Compare commits
800 Commits
v0.14.0rc1 ... v0.16.0rc2
@@ -1,7 +1,8 @@
name: vllm_ci
job_dirs:
  - ".buildkite/test_areas"
  - ".buildkite/image_build"
  - ".buildkite/test_areas"
  - ".buildkite/hardware_tests"
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
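Read as a whole, this hunk grows the pipeline config from one job directory to three and keeps two patterns that force a full run. Below is a minimal Python sketch of how a pipeline generator might consume fields like `job_dirs` and `run_all_patterns`; the function and behavior are illustrative assumptions, not the actual vllm_ci tooling.

```python
from fnmatch import fnmatch
from pathlib import Path

import yaml  # assumes PyYAML is available


def select_job_dirs(config_path: str, changed_files: list[str]) -> list[str]:
    """Return the job directories to load; load everything when a
    run_all pattern matches one of the changed files."""
    cfg = yaml.safe_load(Path(config_path).read_text())
    run_all = any(
        fnmatch(f, pattern)
        for f in changed_files
        for pattern in cfg.get("run_all_patterns", [])
    )
    job_dirs = cfg.get("job_dirs", [])
    if run_all:
        return job_dirs
    # otherwise keep only directories that own at least one changed file
    return [d for d in job_dirs if any(f.startswith(d) for f in changed_files)]
```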
.buildkite/hardware_tests/amd.yaml (new file, 29 lines)
@@ -0,0 +1,29 @@
group: Hardware
steps:
  - label: "AMD: :docker: build image"
    depends_on: []
    device: amd_cpu
    no_plugin: true
    commands:
      - >
        docker build
        --build-arg max_jobs=16
        --build-arg REMOTE_VLLM=1
        --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
        --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
        --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
        -f docker/Dockerfile.rocm
        --target test
        --no-cache
        --progress plain .
      - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 1
        - exit_status: -10 # Agent was lost
          limit: 1
        - exit_status: 1   # Machine occasionally fail
          limit: 1
.buildkite/hardware_tests/ascend_npu.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
group: Hardware
depends_on: ~
steps:
  - label: "Ascend NPU Test"
    soft_fail: true
    timeout_in_minutes: 20
    no_plugin: true
    device: ascend_npu
    commands:
      - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
.buildkite/hardware_tests/cpu.yaml (new file, 100 lines)
@@ -0,0 +1,100 @@
group: CPU
depends_on: []
steps:
  - label: CPU-Kernel Tests
    depends_on: []
    soft_fail: true
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/
      - cmake/cpu_extension.cmake
      - CMakeLists.txt
      - vllm/_custom_ops.py
      - tests/kernels/attention/test_cpu_attn.py
      - tests/kernels/moe/test_cpu_fused_moe.py
      - tests/kernels/test_onednn.py
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
        pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
        pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
        pytest -x -v -s tests/kernels/test_onednn.py"

  - label: CPU-Language Generation and Pooling Model Tests
    depends_on: []
    soft_fail: true
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/
      - vllm/
      - tests/models/language/generation/
      - tests/models/language/pooling/
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
        pytest -x -v -s tests/models/language/generation -m cpu_model
        pytest -x -v -s tests/models/language/pooling -m cpu_model"

  - label: CPU-Quantization Model Tests
    depends_on: []
    soft_fail: true
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/
      - vllm/model_executor/layers/quantization/cpu_wna16.py
      - vllm/model_executor/layers/quantization/gptq_marlin.py
      - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
      - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
      - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
      - tests/quantization/test_compressed_tensors.py
      - tests/quantization/test_cpu_wna16.py
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
        pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
        pytest -x -v -s tests/quantization/test_cpu_wna16.py"

  - label: CPU-Distributed Tests
    depends_on: []
    soft_fail: true
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      - csrc/cpu/shm.cpp
      - vllm/v1/worker/cpu_worker.py
      - vllm/v1/worker/gpu_worker.py
      - vllm/v1/worker/cpu_model_runner.py
      - vllm/v1/worker/gpu_model_runner.py
      - vllm/platforms/cpu.py
      - vllm/distributed/parallel_state.py
      - vllm/distributed/device_communicators/cpu_communicator.py
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
        bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"

  - label: CPU-Multi-Modal Model Tests %N
    depends_on: []
    soft_fail: true
    device: intel_cpu
    no_plugin: true
    source_file_dependencies:
      # - vllm/
      - vllm/model_executor/layers/rotary_embedding
      - tests/models/multimodal/generation/
    commands:
      - |
        bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
        pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
    parallelism: 2

  - label: "Arm CPU Test"
    depends_on: []
    soft_fail: true
    device: arm_cpu
    no_plugin: true
    commands:
      - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
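Each CPU step above gates itself on `source_file_dependencies`. A hedged Python sketch of the prefix check such a gate could perform against the files touched by a change (illustrative only; the real Buildkite plugin logic is not part of this diff):

```python
def step_should_run(source_file_dependencies: list[str],
                    changed_files: list[str]) -> bool:
    """A step runs when any changed file falls under one of its
    declared dependency paths (simple prefix match)."""
    if not source_file_dependencies:
        return True  # no filter declared: always run
    return any(
        changed.startswith(dep)
        for changed in changed_files
        for dep in source_file_dependencies
    )


# example: the CPU-Kernel Tests step reacts to a kernel change
print(step_should_run(
    ["csrc/cpu/", "cmake/cpu_extension.cmake", "CMakeLists.txt"],
    ["csrc/cpu/attention.cpp"],
))  # -> True
```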
.buildkite/hardware_tests/gh200.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
group: Hardware
steps:
  - label: "GH200 Test"
    soft_fail: true
    device: gh200
    no_plugin: true
    optional: true
    commands:
      - nvidia-smi
      - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
.buildkite/hardware_tests/intel.yaml (new file, 17 lines)
@@ -0,0 +1,17 @@
group: Hardware
depends_on: ~
steps:
  - label: "Intel HPU Test"
    soft_fail: true
    device: intel_hpu
    no_plugin: true
    commands:
      - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh

  - label: "Intel GPU Test"
    depends_on: []
    soft_fail: true
    device: intel_gpu
    no_plugin: true
    commands:
      - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -1,56 +1,256 @@
#!/bin/bash
set -e
set -euo pipefail

if [[ $# -lt 8 ]]; then
  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
  exit 1
# replace invalid characters in Docker image tags and truncate to 128 chars
clean_docker_tag() {
  local input="$1"
  echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}

print_usage_and_exit() {
  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
  exit 1
}

print_instance_info() {
  echo ""
  echo "=== Debug: Instance Information ==="
  # Get IMDSv2 token
  if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
      -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
    AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
      http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
    INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
      http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
    INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
      http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
    AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
      http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
    echo "AMI ID: ${AMI_ID}"
    echo "Instance Type: ${INSTANCE_TYPE}"
    echo "Instance ID: ${INSTANCE_ID}"
    echo "AZ: ${AZ}"
  else
    echo "Not running on EC2 or IMDS not available"
  fi
  # Check for warm cache AMI (marker file baked into custom AMI)
  if [[ -f /etc/vllm-ami-info ]]; then
    echo "Cache: warm (custom vLLM AMI)"
    cat /etc/vllm-ami-info
  else
    echo "Cache: cold (standard AMI)"
  fi
  echo "==================================="
  echo ""
}

setup_buildx_builder() {
  echo "--- :buildkite: Setting up buildx builder"
  if [[ -S "${BUILDKIT_SOCKET}" ]]; then
    # Custom AMI with standalone buildkitd - use remote driver for warm cache
    echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
    echo "Using remote driver to connect to buildkitd (warm cache available)"
    if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
      echo "Using existing baked-vllm-builder"
      docker buildx use baked-vllm-builder
    else
      echo "Creating baked-vllm-builder with remote driver"
      docker buildx create \
        --name baked-vllm-builder \
        --driver remote \
        --use \
        "unix://${BUILDKIT_SOCKET}"
    fi
    docker buildx inspect --bootstrap
  elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
    # Existing builder available
    echo "Using existing builder: ${BUILDER_NAME}"
    docker buildx use "${BUILDER_NAME}"
    docker buildx inspect --bootstrap
  else
    # No local buildkitd, no existing builder - create new docker-container builder
    echo "No local buildkitd found, using docker-container driver"
    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
    docker buildx inspect --bootstrap
  fi

  # builder info
  echo "Active builder:"
  docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}

check_and_skip_if_image_exists() {
  if [[ -n "${IMAGE_TAG:-}" ]]; then
    echo "--- :mag: Checking if image exists"
    if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
      echo "Image already exists: ${IMAGE_TAG}"
      echo "Skipping build"
      exit 0
    fi
    echo "Image not found, proceeding with build"
  fi
}

ecr_login() {
  aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
  aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}

prepare_cache_tags() {
  # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
  TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
  MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"

  if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
    if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
      cache="${MAIN_CACHE_ECR}:latest"
    else
      clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
      cache="${TEST_CACHE_ECR}:${clean_branch}"
    fi
    CACHE_TO="$cache"
    CACHE_FROM="$cache"
    CACHE_FROM_BASE_BRANCH="$cache"
  else
    CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
    CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
    if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
      CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
    else
      clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
      CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
    fi
  fi

  CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
  export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}

resolve_parent_commit() {
  if [[ -z "${PARENT_COMMIT:-}" ]]; then
    PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
    if [[ -n "${PARENT_COMMIT}" ]]; then
      echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
      export PARENT_COMMIT
    else
      echo "Could not determine parent commit (may be first commit in repo)"
    fi
  else
    echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
  fi
}

print_bake_config() {
  echo "--- :page_facing_up: Resolved bake configuration"
  BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
  docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
  echo "Saved bake config to ${BAKE_CONFIG_FILE}"
  echo "--- :arrow_down: Uploading bake config to Buildkite"
  buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}

#################################
#          Main Script          #
#################################
print_instance_info

if [[ $# -lt 7 ]]; then
  print_usage_and_exit
fi

# input args
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
# build config
TARGET="test-ci"
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"

# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
prepare_cache_tags
ecr_login

# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
  echo "Image not found, proceeding with build..."
else
  echo "Image found"
  exit 0
# Environment info (for docs and human readers)
#   VLLM_CI_BRANCH - ci-infra branch to use (default: main)
#   VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
#   BUILDER_NAME - Name for buildx builder (default: vllm-builder)
#
# Build configuration (exported as environment variables for bake):
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT

# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"

# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"

echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"

check_and_skip_if_image_exists

echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"

if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
  echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
  echo "Make sure you're running from the vLLM repository root"
  exit 1
fi

if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
else
  merge_base_commit_build_args=""
echo "--- :arrow_down: Downloading ci.hcl"
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
echo "Downloaded to ${CI_HCL_PATH}"

if [[ ! -f "${CI_HCL_PATH}" ]]; then
  echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
  exit 1
fi

# build
docker buildx build --file docker/Dockerfile \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
  --build-arg USE_SCCACHE=1 \
  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
  ${merge_base_commit_build_args} \
  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
  --cache-to type=registry,ref=${CACHE_TO},mode=max \
  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
  --push \
  --target test \
  --progress plain .
setup_buildx_builder

resolve_parent_commit
export PARENT_COMMIT

print_bake_config

echo "--- :docker: Building ${TARGET}"
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"

echo "--- :white_check_mark: Build complete"
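The new `clean_docker_tag` and `prepare_cache_tags` helpers encode the cache-tag policy: post-merge builds on main write to the postmerge cache, other branches and PRs to the test cache, and PRs also read from their base branch's cache. A small Python rendering of the same decision table, useful for eyeballing the rules; it is a sketch mirroring the bash above, not code that exists in the repo.

```python
import re

TEST_CACHE_ECR = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"


def clean_docker_tag(value: str) -> str:
    # replace characters Docker tags do not allow, cap at 128 chars
    return re.sub(r"[^a-zA-Z0-9._-]", "_", value)[:128]


def prepare_cache_tags(pull_request: str, branch: str, base_branch: str = "main") -> dict:
    if pull_request == "false":  # post-merge build
        cache = (f"{MAIN_CACHE_ECR}:latest" if branch == "main"
                 else f"{TEST_CACHE_ECR}:{clean_docker_tag(branch)}")
        to = frm = from_base = cache
    else:                         # pull-request build
        to = frm = f"{TEST_CACHE_ECR}:pr-{pull_request}"
        from_base = (f"{MAIN_CACHE_ECR}:latest" if base_branch == "main"
                     else f"{TEST_CACHE_ECR}:{clean_docker_tag(base_branch)}")
    return {
        "CACHE_TO": to,
        "CACHE_FROM": frm,
        "CACHE_FROM_BASE_BRANCH": from_base,
        "CACHE_FROM_MAIN": f"{MAIN_CACHE_ECR}:latest",
    }


# example: a PR against a release branch also reads that branch's cache
print(prepare_cache_tags("12345", "my-feature", "releases/v0.16.0"))
```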
@@ -3,8 +3,10 @@ steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
    timeout_in_minutes: 600
    commands:
      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
      - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
      - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
    retry:
      automatic:
        - exit_status: -1 # Agent was lost
@@ -40,7 +42,7 @@ steps:
          limit: 2
        - exit_status: -10 # Agent was lost
          limit: 2

  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
@@ -0,0 +1,15 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.695
      - name: "exact_match,flexible-extract"
        value: 0.447
limit: 1319
num_fewshot: 5
max_model_len: 262144
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true
@@ -0,0 +1,19 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.7142
      - name: "exact_match,flexible-extract"
        value: 0.4579
env_vars:
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
limit: 1319
num_fewshot: 5
max_model_len: 262144
kv_cache_dtype: fp8
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true
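Both Nemotron configs pin reference gsm8k scores. A hedged Python sketch of how such a config could be checked against fresh lm-eval results with a relative tolerance; the schema follows the YAML above, but the comparison helper itself is illustrative rather than the harness's own code.

```python
import yaml  # assumes PyYAML is available


def check_accuracy(config_path: str, measured: dict[str, float],
                   rel_tol: float = 0.05) -> bool:
    """measured maps metric names (e.g. 'exact_match,strict-match')
    to the scores produced by the current run."""
    cfg = yaml.safe_load(open(config_path))
    ok = True
    for task in cfg["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            got = measured[metric["name"]]
            if abs(got - expected) > rel_tol * expected:
                print(f"{task['name']}/{metric['name']}: got {got}, expected {expected}")
                ok = False
    return ok


# example against the FP8 config above (scores here are made up)
print(check_accuracy(
    "NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml",
    {"exact_match,strict-match": 0.716, "exact_match,flexible-extract": 0.455},
))
```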
@@ -1 +1,2 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
.buildkite/lm-eval-harness/configs/models-small-rocm.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
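The `models-*.txt` lists name which per-model YAML configs a given lm-eval job picks up. A minimal sketch of that lookup; the directory layout is taken from the paths above, while the loader function itself is illustrative.

```python
from pathlib import Path

import yaml  # assumes PyYAML is available

CONFIG_DIR = Path(".buildkite/lm-eval-harness/configs")


def load_model_configs(list_file: str) -> list[dict]:
    """Read a models-*.txt list and load each referenced YAML config."""
    configs = []
    for line in (CONFIG_DIR / list_file).read_text().splitlines():
        name = line.strip()
        if not name or name.startswith("#"):
            continue
        configs.append(yaml.safe_load((CONFIG_DIR / name).read_text()))
    return configs


for cfg in load_model_configs("models-small-rocm.txt"):
    print(cfg["model_name"])
```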
@@ -393,7 +393,7 @@ if __name__ == "__main__":
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/performance-benchmarks/"
            + "performance-benchmarks-descriptions.md"
            "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
@@ -25,9 +25,9 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  declare -g arch_suffix=''

  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
@@ -181,19 +181,20 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

run_latency_tests() {
  # run latency tests using `vllm bench latency` command
  # $1: a json file specifying latency test cases
run_benchmark_tests() {
  # run benchmark tests using `vllm bench <test_type>` command
  # $1: test type (latency or throughput)
  # $2: a json file specifying test cases

  local latency_test_file
  latency_test_file=$1
  local test_type=$1
  local test_file=$2

  # Iterate over latency tests
  jq -c '.[]' "$latency_test_file" | while read -r params; do
  # Iterate over tests
  jq -c '.[]' "$test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^latency_ ]]; then
      echo "In latency-test.json, test_name must start with \"latency_\"."
    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
      exit 1
    fi
@@ -204,15 +205,15 @@
    fi

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")
    bench_params=$(echo "$params" | jq -r '.parameters')
    bench_args=$(json2args "$bench_params")
    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    bench_envs=$(json2envs "$bench_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -225,97 +226,42 @@
      fi
    fi

    latency_command=" $latency_envs vllm bench latency \
    bench_command=" $bench_envs vllm bench $test_type \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"
      $bench_args"

    echo "Running test case $test_name"
    echo "Latency command: $latency_command"
    echo "${test_type^} command: $bench_command"

    # recoding benchmarking command ang GPU command
    # recording benchmarking command and GPU command
    jq_output=$(jq -n \
      --arg latency "$latency_command" \
      --arg command "$bench_command" \
      --arg gpu "$gpu_type" \
      --arg test_type "$test_type" \
      '{
        latency_command: $latency,
        ($test_type + "_command"): $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"
    eval "$bench_command"

    kill_gpu_processes

  done
}

run_latency_tests() {
  run_benchmark_tests "latency" "$1"
}

run_startup_tests() {
  run_benchmark_tests "startup" "$1"
}

run_throughput_tests() {
  # run throughput tests using `vllm bench throughput`
  # $1: a json file specifying throughput test cases

  local throughput_test_file
  throughput_test_file=$1

  # Iterate over throughput tests
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^throughput_ ]]; then
      echo "In throughput-test.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    throughput_command=" $throughput_envs vllm bench throughput \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"
    # recoding benchmarking command ang GPU command
    jq_output=$(jq -n \
      --arg command "$throughput_command" \
      --arg gpu "$gpu_type" \
      '{
        throughput_command: $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"

    kill_gpu_processes

  done
  run_benchmark_tests "throughput" "$1"
}

run_serving_tests() {
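The refactor folds the per-type loops into one `run_benchmark_tests` driver that still leans on the `json2args`/`json2envs` helpers, which are defined elsewhere in the script and not shown in this hunk. A rough Python equivalent of what those helpers do to a test-case entry, for readers following the flow; details such as boolean handling are assumptions, not a transcript of the bash helpers.

```python
import json


def json2args(parameters: dict) -> str:
    """Turn {"tensor_parallel_size": 2, "enforce_eager": True} into
    '--tensor-parallel-size 2 --enforce-eager' (boolean flags drop the value)."""
    parts = []
    for key, value in parameters.items():
        flag = "--" + key.replace("_", "-")
        if isinstance(value, bool):
            if value:
                parts.append(flag)
        else:
            parts.append(f"{flag} {value}")
    return " ".join(parts)


def json2envs(environment_variables: dict) -> str:
    """Turn an environment_variables object into 'KEY=value' prefixes."""
    return " ".join(f"{k}={v}" for k, v in environment_variables.items())


params = json.loads('{"tensor_parallel_size": 2, "num_iters": 15}')
print(json2args(params))  # --tensor-parallel-size 2 --num-iters 15
```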
@@ -447,6 +393,11 @@
      fi
    fi

    # save the compilation mode and optimization level on the serving results
    # whenever they are set
    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
@@ -460,15 +411,15 @@
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
        # pass the tensor parallel size to the client so that it can be displayed
        # on the benchmark dashboard
        # pass the tensor parallel size, the compilation mode, and the optimization
        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          --metadata "tensor_parallel_size=$tp" \
          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
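The serving hunk widens `--metadata` from a single `tensor_parallel_size` entry to three space-separated key=value pairs. A small sketch of how a dashboard-side consumer might split that metadata back into fields; the parsing function and the example values are illustrative, not the actual `vllm bench serve` handling.

```python
def parse_metadata(pairs: list[str]) -> dict[str, str]:
    """Each element is a 'key=value' token, e.g. produced by
    --metadata tensor_parallel_size=2 optimization_level=3."""
    metadata = {}
    for pair in pairs:
        key, _, value = pair.partition("=")
        if key:
            metadata[key] = value
    return metadata


print(parse_metadata([
    "tensor_parallel_size=2",
    "compilation_config.mode=3",
    "optimization_level=2",
]))
```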
@@ -534,6 +485,7 @@ main() {
  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
@@ -1,198 +1,713 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# Build release images (12.9)
|
||||
- label: "Build release image (x86)"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image (arm64)"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
- label: "Create multi-arch manifest"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- input: "Provide Release version here"
|
||||
id: input-release-version
|
||||
fields:
|
||||
- text: "What is the release version?"
|
||||
key: release-version
|
||||
|
||||
- block: "Build CPU release image"
|
||||
key: block-cpu-release-image-build
|
||||
- group: "Build Python wheels"
|
||||
key: "build-wheels"
|
||||
steps:
|
||||
- label: "Build wheel - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - aarch64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - x86_64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- group: "Build release Docker images"
|
||||
key: "build-release-images"
|
||||
steps:
|
||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
|
||||
- block: "Build release image for x86_64 CPU"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - CPU"
|
||||
depends_on:
|
||||
- block-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image for arm64 CPU"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - arm64 - CPU"
|
||||
depends_on:
|
||||
- block-arm64-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- group: "Publish release images"
|
||||
key: "publish-release-images"
|
||||
steps:
|
||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow - CUDA 12.9"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
||||
depends_on:
|
||||
- build-release-image-x86-cuda-13-0
|
||||
- build-release-image-arm64-cuda-13-0
|
||||
id: create-multi-arch-manifest-cuda-13-0
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest-cuda-13-0
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- group: "Publish wheels"
|
||||
key: "publish-wheels"
|
||||
steps:
|
||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||
key: block-upload-release-wheels
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- build-wheels
|
||||
|
||||
- label: "Upload release wheels to PyPI"
|
||||
depends_on:
|
||||
- block-upload-release-wheels
|
||||
id: upload-release-wheels
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||
|
||||
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
# =============================================================================
#
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
# To build a specific version, trigger the build from that branch/tag.
#
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
#   ROCM_PYTHON_VERSION: Python version (default: 3.12)
#   PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
#   ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
#   ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
#
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
#
# =============================================================================

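# A minimal sketch (not part of the pipeline) of how these variables might be
# set for a manually triggered release build; the values below are illustrative
# and the defaults listed above still apply when a variable is unset:
#
#   env:
#     ROCM_PYTHON_VERSION: "3.12"
#     PYTORCH_ROCM_ARCH: "gfx90a;gfx942"
#     ROCM_UPLOAD_WHEELS: "true"
#     ROCM_FORCE_REBUILD: "false"
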
# ROCm Input Step - Collect build configuration (manual trigger only)
|
||||
- input: "ROCm Wheel Release Build Configuration"
|
||||
key: input-rocm-config
|
||||
depends_on: ~
|
||||
if: build.source == "ui"
|
||||
fields:
|
||||
- text: "Python Version"
|
||||
key: "rocm-python-version"
|
||||
default: "3.12"
|
||||
hint: "Python version (e.g., 3.12)"
|
||||
- text: "GPU Architectures"
|
||||
key: "rocm-pytorch-rocm-arch"
|
||||
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
||||
hint: "Semicolon-separated GPU architectures"
|
||||
- select: "Upload Wheels to S3"
|
||||
key: "rocm-upload-wheels"
|
||||
default: "true"
|
||||
options:
|
||||
- label: "No - Build only (nightly/dev)"
|
||||
value: "false"
|
||||
- label: "Yes - Upload to S3 (release)"
|
||||
value: "true"
|
||||
- select: "Force Rebuild Base Wheels"
|
||||
key: "rocm-force-rebuild"
|
||||
default: "false"
|
||||
hint: "Ignore S3 cache and rebuild base wheels from scratch"
|
||||
options:
|
||||
- label: "No - Use cached wheels if available"
|
||||
value: "false"
|
||||
- label: "Yes - Rebuild even if cache exists"
|
||||
value: "true"
|
||||
|
||||
- label: "Build and publish CPU release image"
|
||||
depends_on: block-cpu-release-image-build
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build arm64 CPU release image"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish arm64 CPU release image"
|
||||
depends_on: block-arm64-cpu-release-image-build
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
||||
- label: ":rocm: Build ROCm Base Wheels"
|
||||
id: build-rocm-base-wheels
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
- step: input-rocm-config
|
||||
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest push vllm/vllm-openai:nightly"
|
||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
# Set configuration and check cache
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Get values from meta-data (set by input step) or use defaults
|
||||
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
|
||||
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
|
||||
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Check for force rebuild flag
|
||||
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
|
||||
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
|
||||
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Base Wheels Build Configuration"
|
||||
echo "========================================"
|
||||
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
|
||||
echo "========================================"
|
||||
|
||||
# Save resolved config for later jobs
|
||||
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
|
||||
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
|
||||
|
||||
# Check S3 cache for pre-built wheels
|
||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
|
||||
echo ""
|
||||
echo "Cache key: $${CACHE_KEY}"
|
||||
echo "Cache path: $${CACHE_PATH}"
|
||||
|
||||
# Save cache key for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
|
||||
|
||||
CACHE_STATUS="miss"
|
||||
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
|
||||
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
||||
else
|
||||
echo "Force rebuild requested, skipping cache check"
|
||||
fi
|
||||
|
||||
if [ "$${CACHE_STATUS}" = "hit" ]; then
|
||||
echo ""
|
||||
echo "CACHE HIT! Downloading pre-built wheels..."
|
||||
echo ""
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
||||
|
||||
# Set the S3 path for the cached Docker image (for Job 2 to download)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we used cache (for Docker image handling)
|
||||
buildkite-agent meta-data set "rocm-used-cache" "true"
|
||||
|
||||
echo ""
|
||||
echo "Cache download complete. Skipping Docker build."
|
||||
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
else
|
||||
echo ""
|
||||
echo "CACHE MISS. Building from scratch..."
|
||||
echo ""
|
||||
|
||||
# Build full base image (for later vLLM build)
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Build debs_wheel_release stage for wheel extraction
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
||||
--target debs_wheel_release \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Extract wheels from Docker image
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${container_id}
|
||||
echo "Extracted base wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
# Upload wheels to S3 cache for future builds
|
||||
echo ""
|
||||
echo "Uploading wheels to S3 cache..."
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
||||
|
||||
# Export base Docker image for reuse in vLLM build
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
echo "Docker image size:"
|
||||
ls -lh artifacts/rocm-docker-image/
|
||||
|
||||
# Upload large Docker image to S3 (also cached by cache key)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
|
||||
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Save the S3 path for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we did NOT use cache
|
||||
buildkite-agent meta-data set "rocm-used-cache" "false"
|
||||
|
||||
echo ""
|
||||
echo "Build complete. Wheels cached for future builds."
|
||||
fi
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-base-wheels/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||
- label: ":python: Build vLLM ROCm Wheel - x86_64"
|
||||
id: build-rocm-vllm-wheel
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 180
|
||||
commands:
|
||||
# Download artifacts and prepare Docker image
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
|
||||
# This fixes version detection when tags are moved/force-pushed
|
||||
echo "Fetching latest tags from origin..."
|
||||
git fetch --tags --force origin
|
||||
|
||||
# Log tag information for debugging version detection
|
||||
echo "========================================"
|
||||
echo "Git Tag Verification"
|
||||
echo "========================================"
|
||||
echo "Current HEAD: $(git rev-parse HEAD)"
|
||||
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
|
||||
echo ""
|
||||
echo "Recent tags (pointing to commits near HEAD):"
|
||||
git tag -l --sort=-creatordate | head -5
|
||||
echo "setuptools_scm version detection:"
|
||||
pip install -q setuptools_scm 2>/dev/null || true
|
||||
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
|
||||
echo "========================================"
|
||||
|
||||
# Download wheel artifacts from current build
|
||||
echo "Downloading wheel artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
|
||||
# Download Docker image from S3 (too large for Buildkite artifacts)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image and capture the tag
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
echo "$${LOAD_OUTPUT}"
|
||||
# Extract the actual loaded image tag from "Loaded image: <tag>" output
|
||||
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
if [ -z "$${BASE_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: Failed to extract image tag from docker load output"
|
||||
echo "Load output was: $${LOAD_OUTPUT}"
|
||||
exit 1
|
||||
fi
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Prepare base wheels for Docker build context
|
||||
mkdir -p docker/context/base-wheels
|
||||
touch docker/context/base-wheels/.keep
|
||||
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
|
||||
echo "Base wheels for vLLM build:"
|
||||
ls -lh docker/context/base-wheels/
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM wheel with:"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--file docker/Dockerfile.rocm \
|
||||
--target export_vllm_wheel_release \
|
||||
--output type=local,dest=rocm-dist \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg REMOTE_VLLM=0 \
|
||||
--build-arg GIT_REPO_CHECK=1 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
.
|
||||
|
||||
echo "Built vLLM wheel:"
|
||||
ls -lh rocm-dist/*.whl
|
||||
|
||||
# Copy wheel to artifacts directory
|
||||
mkdir -p artifacts/rocm-vllm-wheel
|
||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
||||
echo "Final vLLM wheel:"
|
||||
ls -lh artifacts/rocm-vllm-wheel/
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-vllm-wheel/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 3: Upload Wheels to S3
|
||||
- label: ":s3: Upload ROCm Wheels to S3"
|
||||
id: upload-rocm-wheels
|
||||
depends_on:
|
||||
- step: build-rocm-vllm-wheel
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
# Download all wheel artifacts and run upload
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Check if upload is enabled (from env var, meta-data, or release branch)
|
||||
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
|
||||
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
|
||||
# Try to get from meta-data (input form)
|
||||
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "Upload check:"
|
||||
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo "========================================"
|
||||
|
||||
# Skip upload if not enabled
|
||||
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
|
||||
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
|
||||
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Upload enabled, proceeding..."
|
||||
|
||||
# Download artifacts from current build
|
||||
echo "Downloading artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
|
||||
|
||||
# Run upload script
|
||||
bash .buildkite/scripts/upload-rocm-wheels.sh
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 4: Annotate ROCm Wheel Release
|
||||
- label: ":memo: Annotate ROCm wheel release"
|
||||
id: annotate-rocm-release
|
||||
depends_on:
|
||||
- step: upload-rocm-wheels
|
||||
allow_failure: true
|
||||
- step: input-release-version
|
||||
allow_failure: true
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
|
||||
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
|
||||
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
|
||||
- block: "Generate Root Index for ROCm Wheels for Release"
|
||||
key: block-generate-root-index-rocm-wheels
|
||||
depends_on: upload-rocm-wheels
|
||||
|
||||
- label: ":package: Generate Root Index for ROCm Wheels for Release"
|
||||
depends_on: block-generate-root-index-rocm-wheels
|
||||
id: generate-root-index-rocm-wheels
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
VARIANT: "rocm700"
|
||||
|
||||
# ROCm Job 5: Build ROCm Release Docker Image
|
||||
- label: ":docker: Build release image - x86_64 - ROCm"
|
||||
id: build-rocm-release-image
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Download Docker image from S3 (set by build-rocm-base-wheels)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Tag and push the base image to ECR
|
||||
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Build vLLM ROCm release image using cached base
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
|
||||
--target vllm-openai \
|
||||
--progress plain \
|
||||
-f docker/Dockerfile.rocm .
|
||||
|
||||
# Push to ECR
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
||||
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
@@ -11,27 +11,36 @@ fi
|
||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel (by commit):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
(Optional) For CUDA 13.0:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
|
||||
(Optional) For CPU:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download the wheel (by version):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
# Download images:
|
||||
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
|
||||
|
||||
# Tag and push images:
|
||||
|
||||
## CUDA
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
@@ -39,16 +48,70 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai:latest-x86_64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
docker push vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker push vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
|
||||
## ROCm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
## CPU
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker push vllm/vllm-openai-cpu:latest-arm64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
|
||||
# Create multi-arch manifest:
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest-cu130
|
||||
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker manifest push vllm/vllm-openai:latest-cu130
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
||||
|
||||
docker manifest rm vllm/vllm-openai-cpu:latest || true
|
||||
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker manifest push vllm/vllm-openai-cpu:latest
|
||||
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
||||
|
||||
112
.buildkite/scripts/annotate-rocm-release.sh
Executable file
@@ -0,0 +1,112 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Generate Buildkite annotation for ROCm wheel release
|
||||
set -ex
|
||||
|
||||
# Get build configuration from meta-data
|
||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
|
||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
|
||||
# TODO: Enable the nightly build for ROCm
|
||||
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
|
||||
if [ -z "${RELEASE_VERSION}" ]; then
|
||||
RELEASE_VERSION="1.0.0.dev"
|
||||
fi
|
||||
|
||||
# S3 URLs
|
||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
|
||||
|
||||
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
|
||||
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
|
||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||
## ROCm Wheel and Docker Image Releases
|
||||
### Build Configuration
|
||||
| Setting | Value |
|
||||
|---------|-------|
|
||||
| **ROCm Version** | ${ROCM_VERSION} |
|
||||
| **Python Version** | ${PYTHON_VERSION} |
|
||||
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
|
||||
| **Branch** | \`${BUILDKITE_BRANCH}\` |
|
||||
| **Commit** | \`${BUILDKITE_COMMIT}\` |
|
||||
|
||||
### :package: Installation
|
||||
|
||||
**Install from this build (by commit):**
|
||||
|
||||
\`\`\`bash
|
||||
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
|
||||
# Example for ROCm ${ROCM_VERSION}:
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
**Install from nightly (if published):**
|
||||
|
||||
\`\`\`bash
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
### :floppy_disk: Download Wheels Directly
|
||||
|
||||
\`\`\`bash
|
||||
# List all ROCm wheels
|
||||
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
|
||||
# Download specific wheels
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
|
||||
\`\`\`
|
||||
|
||||
### :gear: Included Packages
|
||||
- **vllm**: vLLM with ROCm support
|
||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||
- **triton**: Triton
|
||||
- **triton-kernels**: Triton kernels
|
||||
- **torchvision**: TorchVision for ROCm PyTorch
|
||||
- **torchaudio**: Torchaudio for ROCm PyTorch
|
||||
- **amdsmi**: AMD SMI Python bindings
|
||||
- **aiter**: Aiter for ROCm
|
||||
- **flash-attn**: Flash Attention for ROCm
|
||||
|
||||
### :warning: Notes
|
||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||
- Platform: Linux x86_64 only
|
||||
|
||||
### :package: Docker Image Release
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
|
||||
EOF
|
||||
140
.buildkite/scripts/cache-rocm-base-wheels.sh
Executable file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Cache helper for ROCm base wheels
#
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
#
# Usage:
#   cache-rocm-base-wheels.sh check    - Check if cache exists, outputs "hit" or "miss"
#   cache-rocm-base-wheels.sh upload   - Upload wheels to cache
#   cache-rocm-base-wheels.sh download - Download wheels from cache
#   cache-rocm-base-wheels.sh key      - Output the cache key
#   cache-rocm-base-wheels.sh path     - Output the full S3 cache path
#
# Environment variables:
#   S3_BUCKET         - S3 bucket name (default: vllm-wheels)
#   PYTHON_VERSION    - Python version (affects cache key)
#   PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
#
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
# so changes to ROCm version are captured by the Dockerfile hash.

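# A sketch of the sequence the ":rocm: Build ROCm Base Wheels" step of the
# Buildkite release pipeline runs; shown here only to illustrate how the
# subcommands fit together, not as an exact copy of that step:
#
#   CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
#   if [ "$(.buildkite/scripts/cache-rocm-base-wheels.sh check)" = "hit" ]; then
#       .buildkite/scripts/cache-rocm-base-wheels.sh download
#   else
#       # ...build the base wheels into artifacts/rocm-base-wheels/ ...
#       .buildkite/scripts/cache-rocm-base-wheels.sh upload
#   fi
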
set -euo pipefail
|
||||
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
DOCKERFILE="docker/Dockerfile.rocm_base"
|
||||
CACHE_PREFIX="rocm/cache"
|
||||
|
||||
# Generate hash from Dockerfile content + build args
|
||||
generate_cache_key() {
|
||||
# Include Dockerfile content
|
||||
if [[ ! -f "$DOCKERFILE" ]]; then
|
||||
echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
|
||||
|
||||
# Include key build args that affect the output
|
||||
# These should match the ARGs in Dockerfile.rocm_base that change the build output
|
||||
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
|
||||
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
|
||||
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
|
||||
|
||||
echo "${dockerfile_hash}-${args_hash}"
|
||||
}
|
||||
|
||||
CACHE_KEY=$(generate_cache_key)
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
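
# Example of the resulting key shape (illustrative values): the key is the first
# 16 hex chars of the Dockerfile sha256 joined to the first 8 hex chars of the
# build-arg hash, e.g.:
#   CACHE_KEY="3fa85f64c21d0e9b-1a79a4d6"
#   CACHE_PATH="s3://vllm-wheels/rocm/cache/3fa85f64c21d0e9b-1a79a4d6/"
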
case "${1:-}" in
|
||||
check)
|
||||
echo "Checking cache for key: ${CACHE_KEY}" >&2
|
||||
echo "Cache path: ${CACHE_PATH}" >&2
|
||||
echo "Variables used in cache key:" >&2
|
||||
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
|
||||
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
|
||||
|
||||
# Check if cache exists by listing objects
|
||||
# We look for at least one .whl file
|
||||
echo "Running: aws s3 ls ${CACHE_PATH}" >&2
|
||||
S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
|
||||
echo "S3 ls output:" >&2
|
||||
echo "$S3_OUTPUT" | head -5 >&2
|
||||
|
||||
if echo "$S3_OUTPUT" | grep -q "\.whl"; then
|
||||
echo "hit"
|
||||
else
|
||||
echo "miss"
|
||||
fi
|
||||
;;
|
||||
|
||||
upload)
|
||||
echo "========================================"
|
||||
echo "Uploading wheels to cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
|
||||
echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Uploading $WHEEL_COUNT wheels..."
|
||||
aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
|
||||
|
||||
echo ""
|
||||
echo "Cache upload complete!"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
download)
|
||||
echo "========================================"
|
||||
echo "Downloading wheels from cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
|
||||
|
||||
echo ""
|
||||
echo "Downloaded wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo ""
|
||||
echo "Total: $WHEEL_COUNT wheels"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
key)
|
||||
echo "${CACHE_KEY}"
|
||||
;;
|
||||
|
||||
path)
|
||||
echo "${CACHE_PATH}"
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Usage: $0 {check|upload|download|key|path}" >&2
|
||||
echo "" >&2
|
||||
echo "Commands:" >&2
|
||||
echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
|
||||
echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
|
||||
echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
|
||||
echo " key - Output the cache key" >&2
|
||||
echo " path - Output the full S3 cache path" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
242
.buildkite/scripts/cherry-pick-from-milestone.sh
Executable file
@@ -0,0 +1,242 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# cherry-pick-from-milestone.sh
|
||||
# Find commits from a GitHub milestone that are missing from the current branch
|
||||
# and output them in chronological order for cherry-picking.
|
||||
#
|
||||
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") <milestone> [options]
|
||||
|
||||
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
|
||||
|
||||
Arguments:
|
||||
milestone The GitHub milestone name (e.g., v0.14.0)
|
||||
|
||||
Options:
|
||||
--dry-run Show the cherry-pick commands without executing (default)
|
||||
--execute Actually execute the cherry-picks
|
||||
--main-branch Specify the main branch name (default: main)
|
||||
--help Show this help message
|
||||
|
||||
Examples:
|
||||
$(basename "$0") v0.14.0
|
||||
$(basename "$0") v0.14.0 --dry-run
|
||||
$(basename "$0") v0.14.0 --execute
|
||||
$(basename "$0") v0.14.0 --main-branch master
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Default values
|
||||
MILESTONE=""
|
||||
DRY_RUN=true
|
||||
MAIN_BRANCH="main"
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--main-branch)
|
||||
MAIN_BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
MILESTONE="$1"
|
||||
else
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate milestone argument
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
log_error "Milestone is required"
|
||||
usage
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if gh CLI is available
|
||||
if ! command -v gh &>/dev/null; then
|
||||
log_error "GitHub CLI (gh) is not installed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if authenticated with gh
|
||||
if ! gh auth status &>/dev/null; then
|
||||
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CURRENT_BRANCH=$(git branch --show-current)
|
||||
log_info "Current branch: ${CURRENT_BRANCH}"
|
||||
log_info "Main branch: ${MAIN_BRANCH}"
|
||||
log_info "Milestone: ${MILESTONE}"
|
||||
echo ""
|
||||
|
||||
# Fetch latest from remote
|
||||
log_info "Fetching latest from remote..."
|
||||
git fetch origin "$MAIN_BRANCH" --quiet
|
||||
|
||||
# Get merged PRs from the milestone, sorted by merge date
|
||||
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
||||
|
||||
# Store PR data in a temp file
|
||||
PR_DATA=$(mktemp)
|
||||
trap "rm -f $PR_DATA" EXIT
|
||||
|
||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||
--limit 1000 \
|
||||
--json number,title,mergeCommit,mergedAt \
|
||||
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
|
||||
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
|
||||
log_error "This could be due to:"
|
||||
log_error " - Milestone does not exist"
|
||||
log_error " - Network/authentication issues"
|
||||
log_error " - Invalid milestone name format"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -s "$PR_DATA" ]]; then
|
||||
log_warn "No merged PRs found for milestone '${MILESTONE}'"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
TOTAL_PRS=$(wc -l < "$PR_DATA")
|
||||
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
|
||||
echo ""
|
||||
|
||||
# Find commits that are missing from current branch
|
||||
MISSING_COMMITS=()
|
||||
MISSING_INFO=()
|
||||
|
||||
while IFS=$'\t' read -r sha pr_number title; do
|
||||
# Skip if SHA is empty or null
|
||||
if [[ -z "$sha" || "$sha" == "null" ]]; then
|
||||
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Check if this commit is already in the current branch
|
||||
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
|
||||
log_success "PR #${pr_number} already in branch: ${title:0:60}"
|
||||
else
|
||||
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
|
||||
MISSING_COMMITS+=("$sha")
|
||||
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
|
||||
fi
|
||||
done < "$PR_DATA"
|
||||
|
||||
echo ""
|
||||
|
||||
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
|
||||
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
|
||||
echo ""
|
||||
|
||||
# Output the cherry-pick commands
|
||||
echo "=========================================="
|
||||
echo "Cherry-pick commands (in chronological order):"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
for info in "${MISSING_INFO[@]}"; do
|
||||
echo "# $info"
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "# Run these commands to cherry-pick all missing commits:"
|
||||
echo "git cherry-pick ${MISSING_COMMITS[*]}"
|
||||
echo ""
|
||||
|
||||
# Or one by one
|
||||
echo "# Or cherry-pick one at a time:"
|
||||
for sha in "${MISSING_COMMITS[@]}"; do
|
||||
echo "git cherry-pick $sha"
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Execute if requested
|
||||
if [[ "$DRY_RUN" == false ]]; then
|
||||
echo "=========================================="
|
||||
log_info "Executing cherry-picks..."
|
||||
echo "=========================================="
|
||||
|
||||
for i in "${!MISSING_COMMITS[@]}"; do
|
||||
sha="${MISSING_COMMITS[$i]}"
|
||||
info="${MISSING_INFO[$i]}"
|
||||
|
||||
echo ""
|
||||
log_info "Cherry-picking: $info"
|
||||
|
||||
if git cherry-pick "$sha"; then
|
||||
log_success "Successfully cherry-picked $sha"
|
||||
else
|
||||
log_error "Failed to cherry-pick $sha"
|
||||
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
log_success "All cherry-picks completed successfully!"
|
||||
else
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
|
||||
echo "Run with --execute to perform the cherry-picks"
|
||||
echo "=========================================="
|
||||
fi
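
The membership test in the loop above relies on `git merge-base --is-ancestor`; a standalone sketch of the same check (the SHA below is a made-up placeholder):

    sha=0123456789abcdef0123456789abcdef01234567   # placeholder commit SHA
    if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
        echo "already reachable from HEAD - nothing to cherry-pick"
    else
        echo "missing from this branch - candidate for: git cherry-pick $sha"
    fi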
|
||||
@@ -3,7 +3,14 @@
|
||||
set -ex
|
||||
|
||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
||||
# This script uses the DockerHub API to list and delete old tags with the specified prefix
|
||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
|
||||
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
|
||||
|
||||
# Get tag prefix from argument, default to "nightly-" if not provided
|
||||
TAG_PREFIX="${1:-nightly-}"
|
||||
|
||||
echo "Cleaning up tags with prefix: $TAG_PREFIX"
|
||||
|
||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||
@@ -45,7 +52,7 @@ get_all_tags() {
|
||||
set -x
|
||||
|
||||
# Get both last_updated timestamp and tag name, separated by |
|
||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
|
||||
|
||||
if [ -z "$tags" ]; then
|
||||
break
|
||||
|
||||
@@ -16,6 +16,18 @@ from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
def normalize_package_name(name: str) -> str:
|
||||
"""
|
||||
Normalize package name according to PEP 503.
|
||||
https://peps.python.org/pep-0503/#normalized-names
|
||||
|
||||
Replace runs of underscores, hyphens, and periods with a single hyphen,
|
||||
and lowercase the result.
|
||||
"""
|
||||
return re.sub(r"[-_.]+", "-", name).lower()
|
||||
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
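
A quick shell sketch of the PEP 503 rule the new normalize_package_name helper applies (the example name is made up): runs of hyphens, underscores and periods collapse to a single hyphen and the result is lowercased.

    echo "Foo__Bar..baz-QUX" | sed -E 's/[-_.]+/-/g' | tr '[:upper:]' '[:lower:]'
    # -> foo-bar-baz-qux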
|
||||
|
||||
@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
version = version.removesuffix("." + variant)
|
||||
else:
|
||||
if "+" in version:
|
||||
version, variant = version.split("+")
|
||||
version_part, suffix = version.split("+", 1)
|
||||
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
|
||||
# Git hashes and other suffixes are NOT variants
|
||||
if suffix.startswith(("rocm", "cu", "cpu")):
|
||||
variant = suffix
|
||||
version = version_part
|
||||
# Otherwise keep the full version string (variant stays None)
|
||||
|
||||
return WheelFileInfo(
|
||||
package_name=package_name,
|
||||
@@ -94,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
|
||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||
"""
|
||||
Generate project list HTML content linking to each project & variant sub-directory.
|
||||
Generate project list HTML content linking to each project & variant subdirectory.
|
||||
"""
|
||||
href_tags = []
|
||||
for name in sorted(subdir_names):
|
||||
@@ -150,23 +168,23 @@ def generate_index_and_metadata(
|
||||
comment (str | None): Optional comment to include in the generated HTML files.
|
||||
|
||||
First, parse all wheel files to extract metadata.
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
|
||||
The index for the default variant (if any) is generated in the root index directory.
|
||||
|
||||
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
||||
is purely a copy of the corresponding variant index, with only the links adjusted.
|
||||
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
||||
|
||||
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
|
||||
If `alias_to_default` is provided, an additional alias subdirectory is created; it has the same content
|
||||
as the default variant index, but the links are adjusted accordingly.
|
||||
|
||||
Index directory structure:
|
||||
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
||||
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
|
||||
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
|
||||
vllm/
|
||||
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
||||
metadata.json # machine-readable metadata for all wheels in this package
|
||||
cpu/ # cpu variant sub-directory
|
||||
cpu/ # cpu variant subdirectory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
@@ -176,7 +194,7 @@ def generate_index_and_metadata(
|
||||
vllm/
|
||||
index.html
|
||||
metadata.json
|
||||
cu130/ # cu130 variant sub-directory
|
||||
cu130/ # cu130 variant subdirectory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
|
||||
print("No wheel files found, skipping index generation.")
|
||||
return
|
||||
|
||||
# For ROCm builds: inherit variant from vllm wheel
|
||||
# All ROCm wheels should share the same variant as vllm
|
||||
rocm_variant = None
|
||||
for file in parsed_files:
|
||||
if (
|
||||
file.package_name == "vllm"
|
||||
and file.variant
|
||||
and file.variant.startswith("rocm")
|
||||
):
|
||||
rocm_variant = file.variant
|
||||
print(f"Detected ROCm variant from vllm: {rocm_variant}")
|
||||
break
|
||||
|
||||
# Apply ROCm variant to all wheels without a variant
|
||||
if rocm_variant:
|
||||
for file in parsed_files:
|
||||
if file.variant is None:
|
||||
file.variant = rocm_variant
|
||||
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
|
||||
|
||||
# Group by variant
|
||||
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
||||
for file in parsed_files:
|
||||
@@ -256,8 +294,8 @@ def generate_index_and_metadata(
|
||||
|
||||
variant_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# gather all package names in this variant
|
||||
packages = set(f.package_name for f in files)
|
||||
# gather all package names in this variant (normalized per PEP 503)
|
||||
packages = set(normalize_package_name(f.package_name) for f in files)
|
||||
if variant == "default":
|
||||
# these packages should also appear in the "project list"
|
||||
# generate after all variants are processed
|
||||
@@ -269,8 +307,10 @@ def generate_index_and_metadata(
|
||||
f.write(project_list_str)
|
||||
|
||||
for package in packages:
|
||||
# filter files belonging to this package only
|
||||
package_files = [f for f in files if f.package_name == package]
|
||||
# filter files belonging to this package only (compare normalized names)
|
||||
package_files = [
|
||||
f for f in files if normalize_package_name(f.package_name) == package
|
||||
]
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
@@ -341,8 +381,13 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
version = args.version
|
||||
if "/" in version or "\\" in version:
|
||||
raise ValueError("Version string must not contain slashes.")
|
||||
# Allow rocm/ prefix, reject other slashes and all backslashes
|
||||
if "\\" in version:
|
||||
raise ValueError("Version string must not contain backslashes.")
|
||||
if "/" in version and not version.startswith("rocm/"):
|
||||
raise ValueError(
|
||||
"Version string must not contain slashes (except for 'rocm/' prefix)."
|
||||
)
|
||||
current_objects_path = Path(args.current_objects)
|
||||
output_dir = Path(args.output_dir)
|
||||
if not output_dir.exists():
|
||||
@@ -393,8 +438,23 @@ if __name__ == "__main__":
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{wheel_dir}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
wheel_dir = args.wheel_dir or version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
|
||||
#
|
||||
# For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
|
||||
# - rocm/{commit}/ (same as wheels)
|
||||
# - rocm/nightly/
|
||||
# - rocm/{version}/
|
||||
# All these are under the "rocm/" prefix, so relative paths should be
|
||||
# relative to "rocm/", not the bucket root.
|
||||
if args.wheel_dir:
|
||||
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
|
||||
wheel_dir = args.wheel_dir.strip().rstrip("/")
|
||||
elif version.startswith("rocm/"):
|
||||
# For rocm/commit, wheel_base_dir should be just the commit part
|
||||
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
|
||||
wheel_dir = version.split("/", 1)[1]
|
||||
else:
|
||||
wheel_dir = version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir
|
||||
index_base_dir = Path(output_dir)
|
||||
|
||||
generate_index_and_metadata(
|
||||
|
||||
@@ -44,6 +44,17 @@ cleanup_docker() {
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup_network() {
|
||||
for node in $(seq 0 $((NUM_NODES-1))); do
|
||||
if docker ps -a -q -f name="node${node}" | grep -q .; then
|
||||
docker stop "node${node}"
|
||||
fi
|
||||
done
|
||||
if docker network ls | grep docker-net; then
|
||||
docker network rm docker-net
|
||||
fi
|
||||
}
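
The guard in cleanup_network works because `docker ps -a -q -f name=...` prints only container IDs, so `grep -q .` succeeds only when at least one ID comes back. A standalone sketch with an assumed container name:

    # Stop "node0" only if a container with that name (running or stopped) exists.
    if docker ps -a -q -f name=node0 | grep -q .; then
        docker stop node0
    fi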
|
||||
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
@@ -76,7 +87,7 @@ mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
echo "Commands:$commands"
|
||||
echo "Raw commands: $commands"
|
||||
|
||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||
|
||||
@@ -158,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
commands=$(echo "$commands" | sed 's/ \\ / /g')
|
||||
echo "Final commands: $commands"
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
@@ -165,7 +179,6 @@ fi
|
||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
||||
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# Test that we're launching on the machine that has
|
||||
@@ -176,53 +189,33 @@ if [[ -z "$render_gid" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
#wait for all processes to finish and collect exit codes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
at_least_one_shard_with_tests=0
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit "${st}"
|
||||
elif [[ ${st} -eq 5 ]]; then
|
||||
echo "Shard exited with status 5 (no tests collected) - treating as success"
|
||||
else # This means st is 0
|
||||
at_least_one_shard_with_tests=1
|
||||
fi
|
||||
done
|
||||
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
|
||||
echo "All shards reported no tests collected. Failing the build."
|
||||
exit 1
|
||||
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
|
||||
|
||||
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
|
||||
|
||||
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
|
||||
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
|
||||
echo "PREFIX: ${prefix}"
|
||||
export composite_command="(command rocm-smi || true)"
|
||||
myIFS=$IFS
|
||||
IFS=','
|
||||
read -ra node0 <<< ${BASH_REMATCH[2]}
|
||||
read -ra node1 <<< ${BASH_REMATCH[3]}
|
||||
IFS=$myIFS
|
||||
for i in "${!node0[@]}";do
|
||||
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
|
||||
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
|
||||
|
||||
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
|
||||
echo "COMMANDS: ${commands}"
|
||||
composite_command=$(echo "${composite_command} && ${commands}")
|
||||
done
|
||||
/bin/bash -c "${composite_command}"
|
||||
cleanup_network
|
||||
else
|
||||
echo "Failed to parse node commands! Exiting."
|
||||
cleanup_network
|
||||
exit 111
|
||||
fi
|
||||
else
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
set -euox pipefail
|
||||
|
||||
echo "--- PP+TP"
|
||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &
|
||||
|
||||
echo "--- DP+TP"
|
||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &
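
The readiness wait used twice above could be factored into a small helper; a sketch under the assumption that the server exposes /v1/models on the given port:

    wait_for_server() {   # usage: wait_for_server PORT [TIMEOUT_SECONDS]
        local port=$1 timeout_s=${2:-600}
        timeout "$timeout_s" bash -c \
            "until curl -sf localhost:${port}/v1/models > /dev/null; do sleep 1; done"
    }
    wait_for_server 8000 600 || exit 1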
|
||||
@@ -2,119 +2,19 @@
|
||||
|
||||
# This script builds the CPU docker image and runs the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
set -euox pipefail
|
||||
|
||||
# allow binding to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
# used for TP/PP E2E test
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
IMAGE_NAME="cpu-test-$NUMA_NODE"
|
||||
TIMEOUT_VAL=$1
|
||||
TEST_COMMAND=$2
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
# building the docker image
|
||||
echo "--- :docker: Building Docker image"
|
||||
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# list packages
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
# Note: disable until supports V1
|
||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -x -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
||||
|
||||
# Run AWQ/GPTQ test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_cpu_wna16.py"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/lora/test_qwenvl.py"
|
||||
|
||||
# online serving: tp+pp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
|
||||
# online serving: tp+dp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
}
|
||||
|
||||
# All CPU tests are expected to finish in less than 40 mins.
|
||||
export -f cpu_tests
|
||||
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||
docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
|
||||
timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
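
With the rewrite the script takes the timeout and the test command as positional arguments; an invocation sketch (the script path is an assumption):

    # Run the oneDNN kernel tests with a 40-minute cap on NUMA node 1 (path assumed).
    NUMA_NODE=1 bash .buildkite/scripts/hardware_ci/run-cpu-test.sh \
        40m "pytest -x -v -s tests/kernels/test_onednn.py"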
|
||||
|
||||
@@ -5,7 +5,9 @@
|
||||
set -exuo pipefail
|
||||
|
||||
# Try building the docker image
|
||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
||||
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
|
||||
cat <<EOF | docker build -t ${image_name} -f - .
|
||||
FROM gaudi-base-image:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
|
||||
ENV no_proxy=localhost,127.0.0.1
|
||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=empty pip install .
|
||||
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
|
||||
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
|
||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
# install development dependencies (for testing)
|
||||
@@ -36,15 +39,20 @@ EOF
|
||||
# functions, while other platforms only need one remove_docker_container
|
||||
# function.
|
||||
EXITCODE=1
|
||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
||||
remove_docker_containers() { docker rm -f ${container_name} || true; }
|
||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||
remove_docker_containers
|
||||
|
||||
echo "Running HPU plugin v1 test"
|
||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
||||
docker run --rm --runtime=habana --name=${container_name} --network=host \
|
||||
-e HABANA_VISIBLE_DEVICES=all \
|
||||
hpu-plugin-v1-test-env \
|
||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
||||
-e VLLM_SKIP_WARMUP=true \
|
||||
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
|
||||
-e PT_HPU_LAZY_MODE=1 \
|
||||
"${image_name}" \
|
||||
/bin/bash -c '
|
||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
'
|
||||
|
||||
EXITCODE=$?
|
||||
if [ $EXITCODE -eq 0 ]; then
|
||||
|
||||
@@ -38,15 +38,18 @@ docker run \
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
cd tests
|
||||
pytest -v -s v1/core
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
||||
pytest -v -s v1/engine
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
36
.buildkite/scripts/push-nightly-builds.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Get the tag variant from the first argument; default to empty if not provided. It should be something like "cu130".
|
||||
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
|
||||
# otherwise they will be cleaned up together with the main "nightly" tags.
|
||||
|
||||
TAG_VARIANT="$1"
|
||||
if [ -n "$TAG_VARIANT" ]; then
|
||||
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
|
||||
TAG_NAME="$TAG_VARIANT-nightly"
|
||||
else
|
||||
ORIG_TAG_SUFFIX=""
|
||||
TAG_NAME="nightly"
|
||||
fi
|
||||
|
||||
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
|
||||
|
||||
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
|
||||
|
||||
# pull original arch-dependent images from AWS ECR Public
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
|
||||
# tag arch-dependent images
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
# push arch-dependent images to DockerHub
|
||||
docker push vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker push vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
# push arch-independent manifest to DockerHub
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
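
To make the tag arithmetic concrete, a runnable sketch with made-up values for the variant argument and commit SHA:

    TAG_VARIANT=cu130                      # hypothetical argument
    ORIG_TAG_NAME=abc1234                  # hypothetical commit SHA
    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
    TAG_NAME="$TAG_VARIANT-nightly"
    echo "pull : vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX"   # abc1234-x86_64-cu130
    echo "push : vllm/vllm-openai:$TAG_NAME-x86_64"                         # cu130-nightly-x86_64
    echo "push : vllm/vllm-openai:$TAG_NAME-$ORIG_TAG_NAME"                 # cu130-nightly-abc1234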
|
||||
@@ -43,7 +43,6 @@ trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--enable-eplb \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--all2all-backend $BACK \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
@@ -18,15 +18,18 @@ wait_for_server() {
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
# Set BACKENDS and platform-specific args based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
PLATFORM_ARGS=("--no-async-scheduling")
|
||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
PLATFORM_ARGS=()
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
@@ -54,6 +57,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
"${PLATFORM_ARGS[@]}" \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
227
.buildkite/scripts/trigger-ci-build.sh
Executable file
@@ -0,0 +1,227 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# trigger-ci-build.sh
|
||||
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
|
||||
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
#
|
||||
# Usage: ./trigger-ci-build.sh [options]
|
||||
#
|
||||
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
|
||||
#
|
||||
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default configuration
|
||||
PIPELINE="ci"
|
||||
DRY_RUN=true
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") [options]
|
||||
|
||||
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
|
||||
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
|
||||
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
|
||||
Options:
|
||||
--execute Actually trigger the build (default: dry-run)
|
||||
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
|
||||
--commit Override commit SHA (default: current HEAD)
|
||||
--branch Override branch name (default: current branch)
|
||||
--message Custom build message (default: auto-generated)
|
||||
--help Show this help message
|
||||
|
||||
Prerequisites:
|
||||
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
|
||||
- bk configured: bk configure
|
||||
|
||||
Examples:
|
||||
$(basename "$0") # Dry-run, show what would happen
|
||||
$(basename "$0") --execute # Actually trigger the build
|
||||
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Parse arguments
|
||||
COMMIT=""
|
||||
BRANCH=""
|
||||
MESSAGE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--pipeline)
|
||||
PIPELINE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--commit)
|
||||
COMMIT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--branch)
|
||||
BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--message)
|
||||
MESSAGE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Check if bk CLI is installed
|
||||
if ! command -v bk &>/dev/null; then
|
||||
log_error "Buildkite CLI (bk) is not installed"
|
||||
echo ""
|
||||
echo "Install with:"
|
||||
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
|
||||
echo ""
|
||||
echo "Then configure:"
|
||||
echo " bk configure"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get current commit and branch if not overridden
|
||||
if [[ -z "$COMMIT" ]]; then
|
||||
COMMIT=$(git rev-parse HEAD)
|
||||
fi
|
||||
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
BRANCH=$(git branch --show-current)
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
# Detached HEAD state - try to get branch from ref
|
||||
BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
fi
|
||||
fi
|
||||
|
||||
# Generate default message if not provided
|
||||
if [[ -z "$MESSAGE" ]]; then
|
||||
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
|
||||
MESSAGE="[Manual] ${COMMIT_MSG}"
|
||||
fi
|
||||
|
||||
# Safety check: Verify the commit exists on the remote
|
||||
log_info "Verifying commit exists on remote..."
|
||||
git fetch origin --quiet 2>/dev/null || true
|
||||
|
||||
# Check if commit is reachable from any remote branch
|
||||
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
|
||||
if [[ -z "$REMOTE_BRANCHES" ]]; then
|
||||
log_error "Commit ${COMMIT} does not exist on any remote branch!"
|
||||
echo ""
|
||||
echo "The CI system will fail to checkout this commit."
|
||||
echo "Please push your changes first:"
|
||||
echo ""
|
||||
echo " git push origin ${BRANCH}"
|
||||
echo ""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "Commit found on remote branches:"
|
||||
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
|
||||
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
|
||||
echo " ... and more"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
log_info "Pipeline: ${PIPELINE}"
|
||||
log_info "Branch: ${BRANCH}"
|
||||
log_info "Commit: ${COMMIT}"
|
||||
log_info "Message: ${MESSAGE}"
|
||||
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
|
||||
echo ""
|
||||
|
||||
# Build the command
|
||||
CMD=(bk build create
|
||||
-y
|
||||
-w
|
||||
-i
|
||||
--pipeline "${PIPELINE}"
|
||||
--commit "${COMMIT}"
|
||||
--branch "${BRANCH}"
|
||||
--message "${MESSAGE}"
|
||||
--env "RUN_ALL=1"
|
||||
--env "NIGHTLY=1"
|
||||
)
|
||||
|
||||
if [[ "$DRY_RUN" == true ]]; then
|
||||
echo "=========================================="
|
||||
log_warn "DRY-RUN MODE - No build will be triggered"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "Command that would be executed:"
|
||||
echo ""
|
||||
# Escape single quotes in values for safe shell display
|
||||
escape_for_shell() {
|
||||
printf '%s' "$1" | sed "s/'/'\\\\''/g"
|
||||
}
|
||||
echo " bk build create \\"
|
||||
echo " -y \\"
|
||||
echo " -w \\"
|
||||
echo " -i \\"
|
||||
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
|
||||
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
|
||||
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
|
||||
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
|
||||
echo " --env 'RUN_ALL=1' \\"
|
||||
echo " --env 'NIGHTLY=1'"
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
|
||||
echo ""
|
||||
echo " $0 --execute"
|
||||
echo "=========================================="
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Triggering build..."
|
||||
|
||||
# Execute the command - bk will print the URL and open browser
|
||||
"${CMD[@]}"
|
||||
70
.buildkite/scripts/upload-release-wheels-pypi.sh
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
||||
|
||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||
|
||||
if [[ -z "$GIT_VERSION" ]]; then
|
||||
echo "[FATAL] Not on a git tag, cannot create release."
|
||||
exit 1
|
||||
else
|
||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||
fi
|
||||
# sanity check for version mismatch
|
||||
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
|
||||
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
|
||||
echo "[WARNING] Force release and ignore version mismatch"
|
||||
else
|
||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||
|
||||
# check pypi token
|
||||
if [[ -z "$PYPI_TOKEN" ]]; then
|
||||
echo "[FATAL] PYPI_TOKEN is not set."
|
||||
exit 1
|
||||
else
|
||||
export TWINE_USERNAME="__token__"
|
||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||
fi
|
||||
|
||||
set -x # tracing is enabled only after the secret exports above so they are not printed
|
||||
|
||||
# install twine from pypi
|
||||
python3 -m venv /tmp/vllm-release-env
|
||||
source /tmp/vllm-release-env/bin/activate
|
||||
pip install twine
|
||||
python3 -m twine --version
|
||||
|
||||
# copy release wheels to local directory
|
||||
DIST_DIR=/tmp/vllm-release-dist
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
echo "Copying wheels to local directory"
|
||||
mkdir -p $DIST_DIR
|
||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||
echo "Wheels copied to local directory"
|
||||
# generate source tarball
|
||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
||||
ls -la $DIST_DIR
|
||||
|
||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||
echo "No default variant wheels found, quitting..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -m twine check $PYPI_WHEEL_FILES
|
||||
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
|
||||
echo "Wheels uploaded to PyPI"
|
||||
151
.buildkite/scripts/upload-rocm-wheels.sh
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Upload ROCm wheels to S3 with proper index generation
|
||||
#
|
||||
# Required environment variables:
|
||||
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
|
||||
# S3_BUCKET (default: vllm-wheels)
|
||||
#
|
||||
# S3 path structure:
|
||||
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
|
||||
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
|
||||
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== Configuration ========
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
INDICES_OUTPUT_DIR="rocm-indices"
|
||||
PYTHON="${PYTHON_PROG:-python3}"
|
||||
|
||||
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
|
||||
MANYLINUX_VERSION="manylinux_2_35"
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Configuration"
|
||||
echo "========================================"
|
||||
echo "S3 Bucket: $BUCKET"
|
||||
echo "S3 Path: $ROCM_SUBPATH"
|
||||
echo "Commit: $BUILDKITE_COMMIT"
|
||||
echo "Branch: $BUILDKITE_BRANCH"
|
||||
echo "========================================"
|
||||
|
||||
# ======== Part 0: Setup Python ========
|
||||
|
||||
# Detect if python3.12+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# Use new python from docker
|
||||
# Use --user to ensure files are created with correct ownership (not root)
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ======== Part 1: Collect and prepare wheels ========
|
||||
|
||||
# Collect all wheels
|
||||
mkdir -p all-rocm-wheels
|
||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
|
||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||
|
||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||
echo "ERROR: No wheels found to upload!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Rename linux to manylinux in wheel filenames
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
|
||||
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "Wheels to upload:"
|
||||
ls -lh all-rocm-wheels/
|
||||
|
||||
# ======== Part 2: Upload wheels to S3 ========
|
||||
|
||||
echo ""
|
||||
echo "Uploading wheels to $S3_COMMIT_PREFIX"
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
done
|
||||
|
||||
# ======== Part 3: Generate and upload indices ========
|
||||
|
||||
# List existing wheels in commit directory
|
||||
echo ""
|
||||
echo "Generating indices..."
|
||||
obj_json="rocm-objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# Use the existing generate-nightly-index.py
|
||||
# HACK: Replace regex module with stdlib re (same as CUDA script)
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py \
|
||||
--version "$ROCM_SUBPATH" \
|
||||
--current-objects "$obj_json" \
|
||||
--output-dir "$INDICES_OUTPUT_DIR" \
|
||||
--comment "ROCm commit $BUILDKITE_COMMIT"
|
||||
|
||||
# Upload indices to commit directory
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# Update rocm/nightly/ if on main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Updating rocm/nightly/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
|
||||
fi
|
||||
|
||||
# Extract version from vLLM wheel and update version-specific index
|
||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
||||
if [ -n "$VLLM_WHEEL" ]; then
|
||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $VERSION"
|
||||
PURE_VERSION="${VERSION%%+*}"
|
||||
PURE_VERSION="${PURE_VERSION%%.rocm}"
|
||||
echo "Pure version: $PURE_VERSION"
|
||||
|
||||
if [[ "$VERSION" != *"dev"* ]]; then
|
||||
echo "Updating rocm/$PURE_VERSION/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ======== Part 4: Summary ========
|
||||
|
||||
echo ""
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Complete!"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo "Wheels available at:"
|
||||
echo " s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
echo "Install command (by commit):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Install command (nightly):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
|
||||
fi
|
||||
echo ""
|
||||
echo "Wheel count: $WHEEL_COUNT"
|
||||
echo "========================================"
|
||||
@@ -70,7 +70,9 @@ steps:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -81,7 +83,9 @@ steps:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -229,6 +233,7 @@ steps:
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
@@ -264,10 +269,16 @@ steps:
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
# NEW rlhf examples
|
||||
- pushd ../examples/offline_inference/new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
- popd
|
||||
|
||||
- label: Distributed Tests (8 GPUs) # 4min
|
||||
timeout_in_minutes: 10
|
||||
@@ -428,6 +439,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -452,10 +465,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
@@ -499,7 +514,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
@@ -519,6 +534,7 @@ steps:
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
- pytest -v -s cuda/test_platform_no_cuda_init.py
|
||||
|
||||
- label: Samplers Test # 56min
|
||||
timeout_in_minutes: 75
|
||||
@@ -598,9 +614,11 @@ steps:
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# # Limit to no custom ops to reduce running time
|
||||
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@@ -634,8 +652,9 @@ steps:
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||
- vllm/model_executor/layers/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
@@ -703,6 +722,17 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -724,7 +754,7 @@ steps:
|
||||
- label: Benchmarks # 11min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
@@ -735,7 +765,7 @@ steps:
|
||||
- label: Benchmarks CLI Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -834,10 +864,11 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_terratorch.py
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@@ -855,7 +886,7 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
@@ -1114,7 +1145,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -1160,44 +1191,26 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/passes/test_fusion_attn.py
|
||||
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s tests/compile/passes/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
|
||||
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# # Wrap with quotes to escape yaml
|
||||
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1260,7 +1273,7 @@ steps:
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdmultinode]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -1274,15 +1287,15 @@ steps:
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed'
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
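# Illustrative note (added for clarity; an assumption, not part of the upstream file):
# both nodes run the same torchrun invocation; `--rdzv_backend=c10d` with
# `--rdzv_endpoint=192.168.10.10` makes the two 2-process launches rendezvous at the
# first node, forming a single 4-rank job (--nnodes 2 --nproc-per-node=2), while
# `--dp-node-rank` selects each node's data-parallel rank in the example script:
#   torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py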
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 68min
|
||||
@@ -1451,7 +1464,7 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
@@ -1462,10 +1475,10 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 15
|
||||
@@ -1476,7 +1489,7 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
@@ -1491,6 +1504,9 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
@@ -1541,12 +1557,15 @@ steps:
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
@@ -1662,17 +1681,6 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
|
||||
@@ -63,7 +63,9 @@ steps:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -74,7 +76,9 @@ steps:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -202,6 +206,7 @@ steps:
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
@@ -236,10 +241,16 @@ steps:
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
# NEW rlhf examples
|
||||
- pushd ../examples/offline_inference/new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
- popd
|
||||
|
||||
- label: Distributed Tests (8 GPUs) # 4min
|
||||
timeout_in_minutes: 10
|
||||
@@ -360,7 +371,7 @@ steps:
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
@@ -374,6 +385,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -396,10 +409,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
source_file_dependencies:
|
||||
@@ -438,7 +453,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
@@ -504,6 +519,7 @@ steps:
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- pytest -s -v compile/passes --ignore compile/passes/distributed
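# Illustrative sketch (added for clarity; not part of the upstream file): why the
# `find | xargs` form is used. With `-exec ... \;`, find's exit status stays 0 even
# when pytest fails, so the step would pass silently:
#   find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;
# Piping `-print0` into `xargs -0` propagates failures, since xargs exits with a
# non-zero status (123) when any pytest invocation returns non-zero:
#   find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'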
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@@ -531,9 +547,11 @@ steps:
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# # Limit to no custom ops to reduce running time
|
||||
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@@ -562,8 +580,9 @@ steps:
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||
- vllm/model_executor/layers/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
@@ -624,6 +643,56 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -736,10 +805,11 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_terratorch.py
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
timeout_in_minutes: 10
|
||||
@@ -951,7 +1021,7 @@ steps:
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 21 min
|
||||
- label: Blackwell Test # 23 min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@@ -961,7 +1031,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -991,6 +1061,8 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -1009,42 +1081,23 @@ steps:
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# # Wrap with quotes to escape yaml
|
||||
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1102,6 +1155,8 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
- pytest -v -s distributed/test_packed_tensor.py
|
||||
- pytest -v -s distributed/test_weight_transfer.py
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
timeout_in_minutes: 30
|
||||
@@ -1216,7 +1271,7 @@ steps:
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
|
||||
|
||||
- label: Pipeline + Context Parallelism Test # 45min
|
||||
timeout_in_minutes: 60
|
||||
@@ -1319,6 +1374,20 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Acceptance Length Test (Large Models) # optional
|
||||
timeout_in_minutes: 120
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 1
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/model_executor/models/mlp_speculator.py
|
||||
- tests/v1/spec_decode/test_acceptance_length.py
|
||||
commands:
|
||||
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
@@ -1344,22 +1413,31 @@ steps:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
gpu: h200
|
||||
- label: Sequence Parallel Tests (H100) # 60 min
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run sequence parallel tests
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
|
||||
|
||||
- label: Distributed Tests (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### H200 test #####
|
||||
|
||||
- label: LM Eval Large Models (H200) # optional
|
||||
timeout_in_minutes: 60
|
||||
gpu: h200
|
||||
|
||||
@@ -4,8 +4,10 @@ depends_on:
|
||||
steps:
|
||||
- label: V1 attention (H100)
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -13,9 +15,11 @@ steps:
|
||||
|
||||
- label: V1 attention (B200)
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
@@ -2,56 +2,202 @@ group: Compile
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Fusion and Compile Tests (B200)
|
||||
timeout_in_minutes: 40
|
||||
- label: Sequence Parallel Correctness Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/model_executor/layers/
|
||||
- vllm/compilation/
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
- tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
|
||||
- label: Fusion E2E (2 GPUs)(B200)
|
||||
timeout_in_minutes: 40
|
||||
- label: Sequence Parallel Correctness Tests (2xH100)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
|
||||
- label: AsyncTP Correctness Tests (2xH100)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||
|
||||
- label: Distributed Compile Unit Tests (2xH100)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/model_executor/layers
|
||||
- tests/compile/passes/distributed/
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -s -v tests/compile/passes/distributed
|
||||
|
||||
- label: Fusion and Compile Unit Tests (B200)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/quantization/
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
|
||||
- tests/compile/passes/test_fusion_attn.py
|
||||
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
|
||||
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_devices=2 is not set
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
# TODO(luka) move to H100 once pass tests run on H100
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Fusion E2E Quick (H100)
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||
|
||||
- label: Fusion E2E Config Sweep (H100)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp8) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E Config Sweep (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||
# Run just llama3 (fp8 & fp4) for all config combinations
|
||||
# -k "llama-3"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 Quick (H100)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
|
||||
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 (B200)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
# for ar-rms-quant-fp4, also sweep llama3
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
|
||||
@@ -9,6 +9,7 @@ steps:
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- pytest -v -s cuda/test_platform_no_cuda_init.py

- label: Cudagraph
|
||||
timeout_in_minutes: 20
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Distributed Comm Ops
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
@@ -16,9 +16,9 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 90
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
@@ -47,14 +47,13 @@ steps:
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
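# Illustrative note (added for clarity; not part of the upstream file): piping the
# torchrun output into `grep 'Same node test passed'` doubles as an assertion --
# grep exits non-zero when the marker line never appears, which fails the step
# even if torchrun itself exited 0:
#   torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'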
|
||||
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_utils
|
||||
@@ -63,6 +62,7 @@ steps:
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
@@ -97,14 +97,19 @@ steps:
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
# NEW rlhf examples
|
||||
- cd new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
device: h100
|
||||
num_devices: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
@@ -120,9 +125,9 @@ steps:
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: Distributed Tests (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
device: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
@@ -133,26 +138,22 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H200)
|
||||
gpu: h200
|
||||
- label: Distributed Tests (2 GPUs)(H100)
|
||||
timeout_in_minutes: 15
|
||||
device: h100
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
@@ -161,8 +162,9 @@ steps:
|
||||
- label: 2 Node Test (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
num_nodes: 2
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -171,12 +173,12 @@ steps:
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
||||
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
@@ -184,10 +186,21 @@ steps:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -196,4 +209,4 @@ steps:
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
@@ -4,27 +4,27 @@ depends_on:
|
||||
steps:
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
@@ -33,10 +33,11 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
@@ -23,4 +23,8 @@ steps:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
# Run this test standalone for now;
|
||||
# need to untangle the (implicit) use of spawn/fork across the tests.
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
# Run the rest of v1/engine tests
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
|
||||
@@ -14,7 +14,7 @@ steps:
|
||||
- label: EPLB Execution
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
|
||||
@@ -15,8 +15,9 @@ steps:
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||
- vllm/model_executor/layers/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
@@ -57,8 +58,8 @@ steps:
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
@@ -77,7 +78,7 @@ steps:
|
||||
- label: Kernels (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
@@ -85,7 +86,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -114,4 +115,55 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
@@ -12,9 +12,9 @@ steps:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
device: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -24,9 +24,9 @@ steps:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -37,10 +37,39 @@ steps:
|
||||
|
||||
- label: LM Eval Small Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
- label: LM Eval Large Models (H200)
|
||||
timeout_in_minutes: 60
|
||||
device: h200
|
||||
optional: true
|
||||
num_devices: 8
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
||||
|
||||
@@ -14,7 +14,7 @@ steps:
|
||||
|
||||
- label: LoRA TP (Distributed)
|
||||
timeout_in_minutes: 30
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
|
||||
@@ -16,7 +16,7 @@ steps:
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
@@ -27,11 +27,12 @@ steps:
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: V1 Others (CPU)
|
||||
depends_on: ~
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
@@ -71,7 +72,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
@@ -82,7 +83,7 @@ steps:
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/tracing
|
||||
@@ -114,24 +115,29 @@ steps:
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||
depends_on: ~
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -140,7 +146,7 @@ steps:
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@@ -153,7 +159,7 @@ steps:
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
@@ -162,4 +168,18 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
|
||||
- label: Acceptance Length Test (Large Models) # optional
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 1
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/model_executor/models/mlp_speculator.py
|
||||
- tests/v1/spec_decode/test_acceptance_length.py
|
||||
commands:
|
||||
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||
|
||||
@@ -33,18 +33,21 @@ steps:
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_terratorch.py
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_utils.py
|
||||
- tests/models/test_vision.py
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Distributed Model Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||
- vllm/model_executor/models/
|
||||
|
||||
@@ -14,11 +14,13 @@ steps:
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Plugin Tests (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
|
||||
@@ -3,7 +3,7 @@ depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
timeout_in_minutes: 30
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
@@ -17,8 +17,16 @@ steps:
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Compilation Passes Unit Tests
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile/passes
|
||||
commands:
|
||||
- pytest -s -v compile/passes --ignore compile/passes/distributed
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test
|
||||
timeout_in_minutes: 30
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
@@ -30,16 +38,13 @@ steps:
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||
|
||||
- label: PyTorch Fullgraph
|
||||
timeout_in_minutes: 40
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
|
||||
@@ -16,14 +16,14 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Weight Loading Multiple GPU # 33min
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -15,8 +15,8 @@ steps:
|
||||
|
||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
num_devices: 2
|
||||
device: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
|
||||
16 .github/CODEOWNERS (vendored)
@@ -2,8 +2,8 @@
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# These lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
||||
/vllm/model_executor/layers/attention @LucasWilkinson
|
||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
@@ -16,7 +16,7 @@
|
||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
|
||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
@@ -30,12 +30,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1/attention/backends/mla @pavanimajety
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/offloading @ApostaC
|
||||
/vllm/v1/kv_offload @ApostaC @orozery
|
||||
/vllm/v1/worker/gpu/kv_connector.py @orozery
|
||||
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
|
||||
|
||||
# Model runner V2
|
||||
/vllm/v1/worker/gpu @WoosukKwon
|
||||
@@ -54,13 +56,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
/tests/lora @jeejeelee
|
||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
/tests/v1/kv_connector @ApostaC @orozery
|
||||
/tests/v1/kv_offload @ApostaC @orozery
|
||||
/tests/v1/determinism @yewentao256
|
||||
|
||||
# Transformers modeling backend
|
||||
|
||||
12 .github/mergify.yml (vendored)
@@ -414,6 +414,18 @@ pull_request_rules:
|
||||
remove:
|
||||
- needs-rebase
|
||||
|
||||
- name: label-bug
|
||||
description: Automatically apply bug label
|
||||
conditions:
|
||||
- label != stale
|
||||
- or:
|
||||
- title~=(?i)\bbug\b
|
||||
- title~=(?i)\bbugfix\b
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- bug
|
||||
|
||||
- name: label-kv-connector
|
||||
description: Automatically apply kv-connector label
|
||||
conditions:
|
||||
|
||||
3 .github/workflows/macos-smoke-test.yml (vendored)
@@ -29,8 +29,9 @@ jobs:
|
||||
|
||||
- name: Install dependencies and build vLLM
|
||||
run: |
|
||||
uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
|
||||
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
|
||||
uv pip install -e .
|
||||
uv pip install -e . --no-build-isolation
|
||||
env:
|
||||
CMAKE_BUILD_PARALLEL_LEVEL: 4
|
||||
|
||||
|
||||
6 .gitignore (vendored)
@@ -7,6 +7,9 @@ vllm/vllm_flash_attn/*
|
||||
# OpenAI triton kernels copied from source
|
||||
vllm/third_party/triton_kernels/*
|
||||
|
||||
# FlashMLA interface copied from source
|
||||
vllm/third_party/flashmla/flash_mla_interface.py
|
||||
|
||||
# triton jit
|
||||
.triton
|
||||
|
||||
@@ -191,6 +194,9 @@ CLAUDE.md
|
||||
AGENTS.md
|
||||
.codex/
|
||||
|
||||
# Cursor
|
||||
.cursor/
|
||||
|
||||
# DS Store
|
||||
.DS_Store
|
||||
|
||||
|
||||
@@ -121,24 +121,9 @@ repos:
|
||||
name: Update Dockerfile dependency graph
|
||||
entry: tools/pre_commit/update-dockerfile-graph.sh
|
||||
language: script
|
||||
- id: enforce-import-regex-instead-of-re
|
||||
name: Enforce import regex as re
|
||||
entry: python tools/pre_commit/enforce_regex_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# forbid directly import triton
|
||||
- id: forbid-direct-triton-import
|
||||
name: "Forbid direct 'import triton'"
|
||||
entry: python tools/pre_commit/check_triton_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
- id: check-pickle-imports
|
||||
name: Prevent new pickle/cloudpickle imports
|
||||
entry: python tools/pre_commit/check_pickle_imports.py
|
||||
- id: check-forbidden-imports
|
||||
name: Check for forbidden imports
|
||||
entry: python tools/pre_commit/check_forbidden_imports.py
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: [regex]
|
||||
@@ -147,6 +132,17 @@ repos:
|
||||
entry: python tools/pre_commit/validate_config.py
|
||||
language: python
|
||||
additional_dependencies: [regex]
|
||||
- id: validate-docker-versions
|
||||
name: Validate docker/versions.json matches Dockerfile
|
||||
entry: python tools/generate_versions_json.py --check
|
||||
language: python
|
||||
files: ^docker/(Dockerfile|versions\.json)$
|
||||
pass_filenames: false
|
||||
additional_dependencies: [dockerfile-parse]
|
||||
- id: attention-backend-docs
|
||||
name: Check attention backend documentation is up to date
|
||||
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
|
||||
language: python
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@@ -56,8 +56,8 @@ endif()
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@@ -377,7 +377,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/marlin/generate_kernels.py)
|
||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
||||
@@ -412,7 +412,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_float16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
@@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_bfloat16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||
@@ -433,8 +433,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
|
||||
@@ -445,8 +445,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
|
||||
@@ -458,11 +458,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||
"csrc/quantization/marlin/marlin.cu"
|
||||
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/marlin/awq_marlin_repack.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_SRCS}"
|
||||
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
|
||||
@@ -1043,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_MOE_SM75_ARCHS)
|
||||
if (MARLIN_MOE_SM75_ARCHS)
|
||||
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_SM75_SRC}"
|
||||
|
||||
@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
|
||||
|
||||
## Usage
|
||||
|
||||
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
|
||||
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
|
||||
|
||||
For full CLI reference see:
|
||||
|
||||
|
||||
266 benchmarks/attention_benchmarks/README.md (new file)
@@ -0,0 +1,266 @@
|
||||
# vLLM Attention Benchmarking Suite
|
||||
|
||||
Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
cd benchmarks/attention_benchmarks
|
||||
|
||||
# Run a pre-configured benchmark
|
||||
python benchmark.py --config configs/mla_decode.yaml
|
||||
python benchmark.py --config configs/mla_mixed_batch.yaml
|
||||
python benchmark.py --config configs/speculative_decode.yaml
|
||||
python benchmark.py --config configs/standard_attention.yaml
|
||||
python benchmark.py --config configs/reorder_threshold.yaml
|
||||
|
||||
# Or run custom benchmarks
|
||||
python benchmark.py \
|
||||
--backends flash flashinfer \
|
||||
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
|
||||
--output-csv results.csv
|
||||
```
|
||||
|
||||
## Simplified Batch Specification Grammar
|
||||
|
||||
Express workloads concisely using query length and sequence length:
|
||||
|
||||
```python
|
||||
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
|
||||
"q1s1k" # Decode: 1 token with 1K sequence
|
||||
"8q1s1k" # 8 decode requests
|
||||
"q4s1k" # 4-token extend (e.g., spec decode)
|
||||
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
|
||||
"16q4s1k" # 16 spec decode (4 tokens each)
|
||||
```
|
||||
|
||||
### Grammar Rule
|
||||
|
||||
```text
|
||||
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
|
||||
- count: Number of identical requests (optional, default=1)
|
||||
- q_len: Query length (number of new tokens)
|
||||
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||
- 'k': Multiplies value by 1024
|
||||
|
||||
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
|
||||
```
|
||||
|
||||
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
|
||||
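As a worked example, here is a minimal standalone sketch of how a spec string expands into `(q_len, kv_len)` pairs; it mirrors the grammar above (and the regex used by `parse_batch_spec` in `batch_spec.py`) but is only an illustration, not the shipped parser:

```python
import re

def expand_spec(spec: str) -> list[tuple[int, int]]:
    """Expand a batch spec into (q_len, kv_len) pairs per the grammar above."""
    pairs: list[tuple[int, int]] = []
    for seg in spec.split("_"):
        m = re.match(r"^(\d+)?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
        if m is None:
            raise ValueError(f"Invalid batch spec segment: {seg!r}")
        count = int(m.group(1) or 1)
        q_len = int(m.group(2)) * (1024 if m.group(3) else 1)
        # seq_len defaults to q_len (pure prefill) when the 's' part is omitted
        kv_len = int(m.group(4)) * (1024 if m.group(5) else 1) if m.group(4) else q_len
        pairs.extend([(q_len, kv_len)] * count)
    return pairs

print(expand_spec("2q2k_32q1s1k")[:3])  # [(2048, 2048), (2048, 2048), (1, 1024)]
```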
|
||||
## Pre-configured Benchmarks
|
||||
|
||||
The suite includes several pre-configured YAML benchmark configurations:
|
||||
|
||||
### MLA Decode Benchmark
|
||||
|
||||
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/mla_decode.yaml
|
||||
```
|
||||
|
||||
### MLA Mixed Batch Benchmark
|
||||
|
||||
Tests chunked prefill performance with mixed prefill + decode batches.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/mla_mixed_batch.yaml
|
||||
```
|
||||
|
||||
### Speculative Decoding Benchmark
|
||||
|
||||
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/speculative_decode.yaml
|
||||
```
|
||||
|
||||
### Standard Attention Benchmark
|
||||
|
||||
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/standard_attention.yaml
|
||||
```
|
||||
|
||||
### Reorder Threshold Study
|
||||
|
||||
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
|
||||
|
||||
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/reorder_threshold.yaml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Universal Benchmark
|
||||
|
||||
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
|
||||
|
||||
### Standard Attention (Flash/Triton/FlashInfer)
|
||||
|
||||
```bash
|
||||
python benchmark.py \
|
||||
--backends flash triton flashinfer \
|
||||
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
|
||||
--num-layers 10 \
|
||||
--repeats 5 \
|
||||
--output-csv results.csv
|
||||
```
|
||||
|
||||
### MLA Backends
|
||||
|
||||
```bash
|
||||
# Compare all MLA backends
|
||||
python benchmark.py \
|
||||
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
|
||||
--batch-specs "64q1s1k" "64q1s4k" \
|
||||
--output-csv mla_results.csv
|
||||
```
|
||||
|
||||
### Parameter Sweeps
|
||||
|
||||
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
|
||||
|
||||
#### CUTLASS MLA num-splits Optimization
|
||||
|
||||
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
|
||||
|
||||
```bash
|
||||
python benchmark.py \
|
||||
--backend cutlass_mla \
|
||||
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
|
||||
--sweep-param num_kv_splits \
|
||||
--sweep-values 1 2 4 8 16 \
|
||||
--output-json optimal_splits.json
|
||||
```
|
||||
|
||||
#### Reorder Batch Threshold Optimization
|
||||
|
||||
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
|
||||
|
||||
```bash
|
||||
python benchmark.py \
|
||||
--backend flashmla \
|
||||
--batch-specs "q4s1k" "q8s2k" \
|
||||
--sweep-param reorder_batch_threshold \
|
||||
--sweep-values 1 4 16 64 256 512 \
|
||||
--output-csv threshold_sweep.csv
|
||||
```
|
||||
|
||||
### All Command-Line Options
|
||||
|
||||
```text
|
||||
--config CONFIG # Path to YAML config file (overrides other args)
|
||||
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
|
||||
# flashinfer_mla, flashattn_mla, flashmla
|
||||
--backend BACKEND # Single backend (alternative to --backends)
|
||||
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
|
||||
|
||||
# Model configuration
|
||||
--num-layers N # Number of layers
|
||||
--head-dim N # Head dimension
|
||||
--num-q-heads N # Query heads
|
||||
--num-kv-heads N # KV heads
|
||||
--block-size N # Block size
|
||||
|
||||
# Benchmark settings
|
||||
--device DEVICE # Device (default: cuda:0)
|
||||
--repeats N # Repetitions
|
||||
--warmup-iters N # Warmup iterations
|
||||
--profile-memory # Profile memory usage
|
||||
|
||||
# Parameter sweeps
|
||||
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
|
||||
# reorder_batch_threshold)
|
||||
--sweep-values N [N ...] # Values to sweep for the parameter
|
||||
|
||||
# Output
|
||||
--output-csv FILE # Save to CSV
|
||||
--output-json FILE # Save to JSON
|
||||
```
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
| Backend | Hardware |
|
||||
|---------|----------|
|
||||
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||
| CUTLASS MLA | Blackwell (SM100+) |
|
||||
| FlashAttn MLA | Hopper (SM90+) |
|
||||
| FlashMLA | Hopper (SM90+) |
|
||||
| FlashInfer-MLA | Any CUDA GPU |
|
||||
|
||||
## Using MLA Runner Directly
|
||||
|
||||
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
|
||||
|
||||
```python
|
||||
from mla_runner import run_mla_benchmark
|
||||
from common import BenchmarkConfig
|
||||
|
||||
config = BenchmarkConfig(
|
||||
backend="cutlass_mla",
|
||||
batch_spec="64q1s4k",
|
||||
num_layers=10,
|
||||
head_dim=576,
|
||||
num_q_heads=128,
|
||||
num_kv_heads=1,
|
||||
block_size=128,
|
||||
device="cuda:0",
|
||||
repeats=5,
|
||||
warmup_iters=3,
|
||||
)
|
||||
|
||||
# CUTLASS MLA with specific num_kv_splits
|
||||
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
|
||||
print(f"Time: {result.mean_time:.6f}s")
|
||||
|
||||
# FlashInfer-MLA
|
||||
result = run_mla_benchmark("flashinfer_mla", config)
|
||||
|
||||
# FlashAttn MLA (Hopper SM90+)
|
||||
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
|
||||
|
||||
# FlashMLA (Hopper SM90+)
|
||||
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
|
||||
```
|
||||
|
||||
## Python API
|
||||
|
||||
```python
|
||||
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
|
||||
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
|
||||
|
||||
# Parse batch specs
|
||||
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
|
||||
print(format_batch_spec(requests))
|
||||
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
|
||||
|
||||
# Get batch statistics
|
||||
stats = get_batch_stats(requests)
|
||||
print(f"Total tokens: {stats['total_tokens']}")
|
||||
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
|
||||
|
||||
# Format results
|
||||
formatter = ResultsFormatter()
|
||||
formatter.save_csv(results, "output.csv")
|
||||
formatter.save_json(results, "output.json")
|
||||
```
|
||||
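The other helpers exported by the package compose the same way; a short sketch (the counts follow from the spec used above, not from captured output):

```python
from batch_spec import parse_batch_spec, reorder_for_flashinfer, split_by_type

requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")

# FlashInfer ordering: all decode requests first, then prefill/extend
ordered = reorder_for_flashinfer(requests)

# Group requests by type for analysis
groups = split_by_type(requests)
print(len(groups["prefill"]), len(groups["extend"]), len(groups["decode"]))  # 2 1 32
```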
|
||||
## Tips
|
||||
|
||||
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
|
||||
|
||||
**2. Multiple repeats** - Use `--repeats 20` for low variance
|
||||
|
||||
**3. Save results** - Always use `--output-csv` or `--output-json`
|
||||
|
||||
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
|
||||
|
||||
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
|
||||
|
||||
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values
|
||||
44 benchmarks/attention_benchmarks/__init__.py (new file)
@@ -0,0 +1,44 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""vLLM Attention Benchmarking Suite."""
|
||||
|
||||
from .batch_spec import (
|
||||
BatchRequest,
|
||||
format_batch_spec,
|
||||
get_batch_stats,
|
||||
parse_batch_spec,
|
||||
reorder_for_flashinfer,
|
||||
split_by_type,
|
||||
)
|
||||
from .common import (
|
||||
BenchmarkConfig,
|
||||
BenchmarkResult,
|
||||
MockLayer,
|
||||
MockModelConfig,
|
||||
ResultsFormatter,
|
||||
get_attention_scale,
|
||||
is_mla_backend,
|
||||
setup_mla_dims,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Batch specification
|
||||
"BatchRequest",
|
||||
"parse_batch_spec",
|
||||
"format_batch_spec",
|
||||
"reorder_for_flashinfer",
|
||||
"split_by_type",
|
||||
"get_batch_stats",
|
||||
# Benchmarking infrastructure
|
||||
"BenchmarkConfig",
|
||||
"BenchmarkResult",
|
||||
"ResultsFormatter",
|
||||
# Mock objects
|
||||
"MockLayer",
|
||||
"MockModelConfig",
|
||||
# Utilities
|
||||
"setup_mla_dims",
|
||||
"get_attention_scale",
|
||||
"is_mla_backend",
|
||||
]
|
||||
231 benchmarks/attention_benchmarks/batch_spec.py (new file)
@@ -0,0 +1,231 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Simplified batch specification grammar for attention benchmarks.
|
||||
|
||||
Grammar (underscore-separated segments):
|
||||
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
|
||||
- count: Number of identical requests (optional, default=1)
|
||||
- q_len: Query length (number of new tokens)
|
||||
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||
- 'k' suffix: Multiplies value by 1024
|
||||
|
||||
Common patterns:
|
||||
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
|
||||
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
|
||||
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
|
||||
|
||||
Examples:
|
||||
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
|
||||
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
|
||||
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
|
||||
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
|
||||
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
|
||||
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
|
||||
"""
|
||||
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
@dataclass
|
||||
class BatchRequest:
|
||||
"""Represents a single request in a batch."""
|
||||
|
||||
q_len: int # Query length (number of new tokens)
|
||||
kv_len: int # Total KV cache length
|
||||
|
||||
@property
|
||||
def is_decode(self) -> bool:
|
||||
"""True if this is a decode request (q_len == 1)."""
|
||||
return self.q_len == 1
|
||||
|
||||
@property
|
||||
def is_prefill(self) -> bool:
|
||||
"""True if this is a pure prefill (q_len == kv_len)."""
|
||||
return self.q_len == self.kv_len
|
||||
|
||||
@property
|
||||
def is_extend(self) -> bool:
|
||||
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
|
||||
return self.q_len > 1 and self.kv_len > self.q_len
|
||||
|
||||
@property
|
||||
def context_len(self) -> int:
|
||||
"""Context length (KV cache - query)."""
|
||||
return self.kv_len - self.q_len
|
||||
|
||||
def as_tuple(self) -> tuple[int, int]:
|
||||
"""Return as (q_len, kv_len) tuple for compatibility."""
|
||||
return (self.q_len, self.kv_len)
|
||||
|
||||
|
||||
def _parse_size(size_str: str, k_suffix: str) -> int:
|
||||
"""Parse size string with optional 'k' suffix."""
|
||||
size = int(size_str)
|
||||
return size * 1024 if k_suffix == "k" else size
|
||||
|
||||
|
||||
def parse_batch_spec(spec: str) -> list[BatchRequest]:
|
||||
"""
|
||||
Parse batch specification string into list of BatchRequest objects.
|
||||
|
||||
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
|
||||
Args:
|
||||
spec: Batch specification string (see module docstring for grammar)
|
||||
|
||||
Returns:
|
||||
List of BatchRequest objects
|
||||
|
||||
Raises:
|
||||
ValueError: If spec format is invalid
|
||||
"""
|
||||
requests = []
|
||||
|
||||
for seg in spec.split("_"):
|
||||
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
|
||||
if m:
|
||||
cnt = int(m.group(1)) if m.group(1) else 1
|
||||
q_len = _parse_size(m.group(2), m.group(3))
|
||||
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
|
||||
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
|
||||
continue
|
||||
|
||||
raise ValueError(f"Invalid batch spec segment: '{seg}'")
|
||||
|
||||
return requests
|
||||
|
||||
|
||||
def format_batch_spec(requests: list[BatchRequest]) -> str:
|
||||
"""
|
||||
Format list of BatchRequest into human-readable string.
|
||||
|
||||
Groups requests by type and provides counts and sizes.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest objects
|
||||
|
||||
Returns:
|
||||
Formatted string describing the batch
|
||||
"""
|
||||
kinds = {
|
||||
"prefill": [],
|
||||
"extend": [],
|
||||
"decode": [],
|
||||
}
|
||||
|
||||
for req in requests:
|
||||
tup = (req.q_len, req.kv_len)
|
||||
if req.is_prefill:
|
||||
kinds["prefill"].append(tup)
|
||||
elif req.is_extend:
|
||||
kinds["extend"].append(tup)
|
||||
elif req.is_decode:
|
||||
kinds["decode"].append(tup)
|
||||
|
||||
parts = []
|
||||
for kind in ["prefill", "extend", "decode"]:
|
||||
lst = kinds[kind]
|
||||
if not lst:
|
||||
continue
|
||||
|
||||
cnt_total = len(lst)
|
||||
ctr = Counter(lst)
|
||||
inner = []
|
||||
|
||||
for (q, kv), cnt in ctr.items():
|
||||
if kind == "prefill":
|
||||
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||
inner.append(f"{cnt}x{size}")
|
||||
elif kind == "decode":
|
||||
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||
inner.append(f"{cnt}x{size}")
|
||||
else: # extend
|
||||
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||
inner.append(f"{cnt}xq{qstr}kv{kstr}")
|
||||
|
||||
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
|
||||
|
||||
return ", ".join(parts)
|
||||
|
||||
|
||||
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
|
||||
"""
|
||||
Reorder requests for FlashInfer: decode first, then prefill.
|
||||
|
||||
FlashInfer expects decode requests before prefill requests for
|
||||
optimal performance.
|
||||
|
||||
Args:
|
||||
requests: Original list of BatchRequest
|
||||
|
||||
Returns:
|
||||
Reordered list with decode requests first
|
||||
"""
|
||||
decodes = [r for r in requests if r.is_decode]
|
||||
non_decodes = [r for r in requests if not r.is_decode]
|
||||
return decodes + non_decodes
|
||||
|
||||
|
||||
def split_by_type(
|
||||
requests: list[BatchRequest],
|
||||
) -> dict[str, list[BatchRequest]]:
|
||||
"""
|
||||
Split requests by type for analysis.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest
|
||||
|
||||
Returns:
|
||||
Dict with keys: 'decode', 'prefill', 'extend'
|
||||
"""
|
||||
result = {
|
||||
"decode": [],
|
||||
"prefill": [],
|
||||
"extend": [],
|
||||
}
|
||||
|
||||
for req in requests:
|
||||
if req.is_decode:
|
||||
result["decode"].append(req)
|
||||
elif req.is_prefill:
|
||||
result["prefill"].append(req)
|
||||
elif req.is_extend:
|
||||
result["extend"].append(req)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_batch_stats(requests: list[BatchRequest]) -> dict:
|
||||
"""
|
||||
Compute statistics about a batch.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest
|
||||
|
||||
Returns:
|
||||
Dict with batch statistics
|
||||
"""
|
||||
by_type = split_by_type(requests)
|
||||
|
||||
return {
|
||||
"total_requests": len(requests),
|
||||
"num_decode": len(by_type["decode"]),
|
||||
"num_prefill": len(by_type["prefill"]),
|
||||
"num_extend": len(by_type["extend"]),
|
||||
"total_tokens": sum(r.q_len for r in requests),
|
||||
"total_kv_cache": sum(r.kv_len for r in requests),
|
||||
"max_q_len": max((r.q_len for r in requests), default=0),
|
||||
"max_kv_len": max((r.kv_len for r in requests), default=0),
|
||||
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
|
||||
"avg_kv_len": (
|
||||
sum(r.kv_len for r in requests) / len(requests) if requests else 0
|
||||
),
|
||||
}
|
||||
886 benchmarks/attention_benchmarks/benchmark.py (new file)
@@ -0,0 +1,886 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Universal vLLM Attention Benchmark
|
||||
|
||||
Benchmark any attention backend with the extended grammar.
|
||||
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
|
||||
|
||||
Examples:
|
||||
# Standard attention
|
||||
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
|
||||
|
||||
# MLA backends
|
||||
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
|
||||
|
||||
# Parameter sweep (CLI)
|
||||
python benchmark.py --backend cutlass_mla \
|
||||
--batch-specs "64q1s1k" \
|
||||
--sweep-param num_kv_splits \
|
||||
--sweep-values 1 4 8 16
|
||||
|
||||
# Parameter sweep (YAML config - recommended)
|
||||
python benchmark.py --config configs/cutlass_numsplits.yaml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from dataclasses import replace
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from rich.console import Console
|
||||
from tqdm import tqdm
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from batch_spec import parse_batch_spec
|
||||
from common import (
|
||||
BenchmarkConfig,
|
||||
BenchmarkResult,
|
||||
ModelParameterSweep,
|
||||
ParameterSweep,
|
||||
ResultsFormatter,
|
||||
is_mla_backend,
|
||||
)
|
||||
|
||||
|
||||
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
||||
from runner import run_attention_benchmark
|
||||
|
||||
return run_attention_benchmark(config)
|
||||
|
||||
|
||||
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||
"""Run MLA benchmark with appropriate backend."""
|
||||
from mla_runner import run_mla_benchmark as run_mla
|
||||
|
||||
return run_mla(config.backend, config, **kwargs)
|
||||
|
||||
|
||||
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||
"""
|
||||
Run a single benchmark with proper backend selection.
|
||||
|
||||
Args:
|
||||
config: BenchmarkConfig with backend, batch_spec, and model params
|
||||
**kwargs: Additional arguments passed to MLA benchmarks
|
||||
|
||||
Returns:
|
||||
BenchmarkResult (may have error field set on failure)
|
||||
"""
|
||||
try:
|
||||
if is_mla_backend(config.backend):
|
||||
return run_mla_benchmark(config, **kwargs)
|
||||
else:
|
||||
return run_standard_attention_benchmark(config)
|
||||
except Exception as e:
|
||||
return BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=float("inf"),
|
||||
std_time=0,
|
||||
min_time=float("inf"),
|
||||
max_time=float("inf"),
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
|
||||
def run_model_parameter_sweep(
|
||||
backends: list[str],
|
||||
batch_specs: list[str],
|
||||
base_config_args: dict,
|
||||
sweep: ModelParameterSweep,
|
||||
console: Console,
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Run model parameter sweep for given backends and batch specs.
|
||||
|
||||
Args:
|
||||
backends: List of backend names
|
||||
batch_specs: List of batch specifications
|
||||
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||
sweep: ModelParameterSweep configuration
|
||||
console: Rich console for output
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
"""
|
||||
all_results = []
|
||||
|
||||
console.print(
|
||||
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
|
||||
)
|
||||
|
||||
total = len(backends) * len(batch_specs) * len(sweep.values)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for backend in backends:
|
||||
for spec in batch_specs:
|
||||
for value in sweep.values:
|
||||
# Create config with modified model parameter
|
||||
config_args = base_config_args.copy()
|
||||
config_args[sweep.param_name] = value
|
||||
|
||||
# Create config with original backend for running
|
||||
clean_config = BenchmarkConfig(
|
||||
backend=backend, batch_spec=spec, **config_args
|
||||
)
|
||||
|
||||
# Run benchmark
|
||||
result = run_benchmark(clean_config)
|
||||
|
||||
# Replace backend with labeled version for display
|
||||
backend_label = sweep.get_label(backend, value)
|
||||
labeled_config = replace(result.config, backend=backend_label)
|
||||
result = replace(result, config=labeled_config)
|
||||
all_results.append(result)
|
||||
|
||||
if not result.success:
|
||||
console.print(
|
||||
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||
f"{value}: {result.error}[/]"
|
||||
)
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display sweep results - create separate table for each parameter value
|
||||
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
|
||||
formatter = ResultsFormatter(console)
|
||||
|
||||
# Group results by parameter value and extract backend mapping
|
||||
by_param_value = {}
|
||||
backend_mapping = {} # Maps labeled backend -> original backend
|
||||
|
||||
for r in all_results:
|
||||
# Extract original backend and param value from labeled backend
|
||||
# The label format is: {backend}_{param_name}_{value}
|
||||
# We need to reverse engineer this
|
||||
labeled_backend = r.config.backend
|
||||
|
||||
# Try each backend to find which one this result belongs to
|
||||
for backend in backends:
|
||||
for value in sweep.values:
|
||||
expected_label = sweep.get_label(backend, value)
|
||||
if labeled_backend == expected_label:
|
||||
backend_mapping[labeled_backend] = backend
|
||||
param_value = str(value)
|
||||
|
||||
if param_value not in by_param_value:
|
||||
by_param_value[param_value] = []
|
||||
by_param_value[param_value].append(r)
|
||||
break
|
||||
|
||||
# Create a table for each parameter value
|
||||
sorted_param_values = sorted(
|
||||
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
|
||||
)
|
||||
|
||||
for param_value in sorted_param_values:
|
||||
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
|
||||
param_results = by_param_value[param_value]
|
||||
|
||||
# Create modified results with original backend names
|
||||
modified_results = []
|
||||
for r in param_results:
|
||||
# Get the original backend name from our mapping
|
||||
original_backend = backend_mapping[r.config.backend]
|
||||
modified_config = replace(r.config, backend=original_backend)
|
||||
modified_result = replace(r, config=modified_config)
|
||||
modified_results.append(modified_result)
|
||||
|
||||
# Print table with original backend names
|
||||
formatter.print_table(modified_results, backends, compare_to_fastest=True)
|
||||
|
||||
# Show optimal backend for each (param_value, batch_spec) combination
|
||||
console.print(
|
||||
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
|
||||
)
|
||||
|
||||
# Group by (param_value, batch_spec)
|
||||
by_param_and_spec = {}
|
||||
for r in all_results:
|
||||
if r.success:
|
||||
# Find which (backend, value) this result corresponds to
|
||||
labeled_backend = r.config.backend
|
||||
for backend in backends:
|
||||
for value in sweep.values:
|
||||
expected_label = sweep.get_label(backend, value)
|
||||
if labeled_backend == expected_label:
|
||||
param_value = str(value)
|
||||
spec = r.config.batch_spec
|
||||
key = (param_value, spec)
|
||||
|
||||
if key not in by_param_and_spec:
|
||||
by_param_and_spec[key] = []
|
||||
by_param_and_spec[key].append(r)
|
||||
break
|
||||
|
||||
# Sort by param value then spec
|
||||
sorted_keys = sorted(
|
||||
by_param_and_spec.keys(),
|
||||
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
|
||||
)
|
||||
|
||||
current_param_value = None
|
||||
for param_value, spec in sorted_keys:
|
||||
# Print header when param value changes
|
||||
if param_value != current_param_value:
|
||||
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
|
||||
current_param_value = param_value
|
||||
|
||||
results = by_param_and_spec[(param_value, spec)]
|
||||
best = min(results, key=lambda r: r.mean_time)
|
||||
|
||||
# Extract original backend name using the mapping
|
||||
backend_name = backend_mapping[best.config.backend]
|
||||
|
||||
# Show all backends' times for comparison
|
||||
times_str = " | ".join(
|
||||
[
|
||||
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
|
||||
for r in sorted(results, key=lambda r: r.mean_time)
|
||||
]
|
||||
)
|
||||
|
||||
console.print(
|
||||
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
|
||||
)
|
||||
|
||||
return all_results
|
||||
|
||||
|
||||
def run_parameter_sweep(
|
||||
backends: list[str],
|
||||
batch_specs: list[str],
|
||||
base_config_args: dict,
|
||||
sweep: ParameterSweep,
|
||||
console: Console,
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Run parameter sweep for given backends and batch specs.
|
||||
|
||||
Args:
|
||||
backends: List of backend names
|
||||
batch_specs: List of batch specifications
|
||||
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||
sweep: ParameterSweep configuration
|
||||
console: Rich console for output
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
"""
|
||||
all_results = []
|
||||
|
||||
# Build list of values to sweep (including auto if requested)
|
||||
sweep_values = list(sweep.values)
|
||||
if sweep.include_auto:
|
||||
sweep_values.append("auto")
|
||||
|
||||
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
|
||||
|
||||
total = len(backends) * len(batch_specs) * len(sweep_values)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for backend in backends:
|
||||
for spec in batch_specs:
|
||||
for value in sweep_values:
|
||||
# Create config with original backend for running
|
||||
config = BenchmarkConfig(
|
||||
backend=backend, batch_spec=spec, **base_config_args
|
||||
)
|
||||
|
||||
# Prepare kwargs for benchmark runner
|
||||
kwargs = {}
|
||||
if value != "auto":
|
||||
kwargs[sweep.param_name] = value
|
||||
|
||||
# Run benchmark
|
||||
result = run_benchmark(config, **kwargs)
|
||||
|
||||
# Replace backend with labeled version for display
|
||||
backend_label = sweep.get_label(backend, value)
|
||||
labeled_config = replace(result.config, backend=backend_label)
|
||||
result = replace(result, config=labeled_config)
|
||||
all_results.append(result)
|
||||
|
||||
if not result.success:
|
||||
console.print(
|
||||
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||
f"{value}: {result.error}[/]"
|
||||
)
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display sweep results
|
||||
console.print("\n[bold green]Sweep Results:[/]")
|
||||
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
|
||||
formatter = ResultsFormatter(console)
|
||||
formatter.print_table(all_results, backend_labels)
|
||||
|
||||
# Show optimal values
|
||||
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
|
||||
by_spec = {}
|
||||
for r in all_results:
|
||||
if r.success:
|
||||
spec = r.config.batch_spec
|
||||
if spec not in by_spec:
|
||||
by_spec[spec] = []
|
||||
by_spec[spec].append(r)
|
||||
|
||||
for spec in sorted(by_spec.keys()):
|
||||
results = by_spec[spec]
|
||||
best = min(results, key=lambda r: r.mean_time)
|
||||
console.print(
|
||||
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
|
||||
)
|
||||
|
||||
return all_results
|
||||
|
||||
|
||||
def load_config_from_yaml(config_path: str) -> dict:
|
||||
"""Load configuration from YAML file."""
|
||||
with open(config_path) as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
|
||||
"""
|
||||
Generate batch specs from range specifications.
|
||||
|
||||
Args:
|
||||
ranges: List of range specifications, each containing:
|
||||
- template: Batch spec template (e.g., "q{q_len}s1k")
|
||||
- q_len: Dict with start, stop, step, end_inclusive (optional)
|
||||
- Other parameters can also be ranges
|
||||
|
||||
Returns:
|
||||
List of generated batch spec strings
|
||||
|
||||
Example:
|
||||
ranges = [
|
||||
{
    "template": "q{q_len}s1k",
|
||||
"q_len": {
|
||||
"start": 1,
|
||||
"stop": 16,
|
||||
"step": 1,
|
||||
"end_inclusive": true # Optional, defaults to true
|
||||
}
|
||||
}
|
||||
]
|
||||
Returns: ["q1s1k", "q2s1k", ..., "q16s1k"]
|
||||
"""
|
||||
all_specs = []
|
||||
|
||||
for range_spec in ranges:
|
||||
template = range_spec.get("template")
|
||||
if not template:
|
||||
raise ValueError("Range specification must include 'template'")
|
||||
|
||||
# Extract all range parameters from the spec
|
||||
range_params = {}
|
||||
for key, value in range_spec.items():
|
||||
if key == "template":
|
||||
continue
|
||||
if isinstance(value, dict) and "start" in value:
|
||||
# This is a range specification
|
||||
start = value["start"]
|
||||
stop = value["stop"]
|
||||
step = value.get("step", 1)
|
||||
# Check if end should be inclusive (default: True)
|
||||
end_inclusive = value.get("end_inclusive", True)
|
||||
|
||||
# Adjust stop based on end_inclusive
|
||||
if end_inclusive:
|
||||
range_params[key] = list(range(start, stop + 1, step))
|
||||
else:
|
||||
range_params[key] = list(range(start, stop, step))
|
||||
else:
|
||||
# This is a fixed value
|
||||
range_params[key] = [value]
|
||||
|
||||
# Generate all combinations (Cartesian product)
|
||||
if range_params:
|
||||
import itertools
|
||||
|
||||
param_names = list(range_params.keys())
|
||||
param_values = [range_params[name] for name in param_names]
|
||||
|
||||
for values in itertools.product(*param_values):
|
||||
params = dict(zip(param_names, values))
|
||||
spec = template.format(**params)
|
||||
all_specs.append(spec)
|
||||
else:
|
||||
# No parameters, just use template as-is
|
||||
all_specs.append(template)
|
||||
|
||||
return all_specs
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Universal vLLM attention benchmark",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog=__doc__,
|
||||
)
|
||||
|
||||
# Config file
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
help="Path to YAML config file (overrides other args)",
|
||||
)
|
||||
|
||||
# Backend selection
|
||||
parser.add_argument(
|
||||
"--backends",
|
||||
nargs="+",
|
||||
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
|
||||
"flashinfer_mla, flashattn_mla, flashmla)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
help="Single backend (alternative to --backends)",
|
||||
)
|
||||
|
||||
# Batch specifications
|
||||
parser.add_argument(
|
||||
"--batch-specs",
|
||||
nargs="+",
|
||||
default=["q2k", "8q1s1k"],
|
||||
help="Batch specifications using extended grammar",
|
||||
)
|
||||
|
||||
# Model config
|
||||
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
|
||||
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
|
||||
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
|
||||
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
|
||||
parser.add_argument("--block-size", type=int, default=16, help="Block size")
|
||||
|
||||
# Benchmark settings
|
||||
parser.add_argument("--device", default="cuda:0", help="Device")
|
||||
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
|
||||
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
|
||||
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
|
||||
|
||||
# Parameter sweep (use YAML config for advanced sweeps)
|
||||
parser.add_argument(
|
||||
"--sweep-param",
|
||||
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sweep-values",
|
||||
type=int,
|
||||
nargs="+",
|
||||
help="Values to sweep for the parameter",
|
||||
)
|
||||
|
||||
# Output
|
||||
parser.add_argument("--output-csv", help="Save to CSV")
|
||||
parser.add_argument("--output-json", help="Save to JSON")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
console = Console()
|
||||
console.print("[bold cyan]vLLM Attention Benchmark[/]")
|
||||
|
||||
# Load config from YAML if provided
|
||||
if args.config:
|
||||
console.print(f"[yellow]Loading config from: {args.config}[/]")
|
||||
yaml_config = load_config_from_yaml(args.config)
|
||||
|
||||
# Show description if available
|
||||
if "description" in yaml_config:
|
||||
console.print(f"[dim]{yaml_config['description']}[/]")
|
||||
|
||||
# Override args with YAML values
|
||||
# (YAML takes precedence unless CLI arg was explicitly set)
|
||||
# Backend(s)
|
||||
if "backend" in yaml_config:
|
||||
args.backend = yaml_config["backend"]
|
||||
args.backends = None
|
||||
elif "backends" in yaml_config:
|
||||
args.backends = yaml_config["backends"]
|
||||
args.backend = None
|
||||
|
||||
# Check for special modes
|
||||
if "mode" in yaml_config:
|
||||
args.mode = yaml_config["mode"]
|
||||
else:
|
||||
args.mode = None
|
||||
|
||||
# Batch specs and sizes
|
||||
# Support both explicit batch_specs and generated batch_spec_ranges
|
||||
if "batch_spec_ranges" in yaml_config:
|
||||
# Generate batch specs from ranges
|
||||
generated_specs = generate_batch_specs_from_ranges(
|
||||
yaml_config["batch_spec_ranges"]
|
||||
)
|
||||
# Combine with any explicit batch_specs
|
||||
if "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
||||
else:
|
||||
args.batch_specs = generated_specs
|
||||
console.print(
|
||||
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
||||
)
|
||||
elif "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"]
|
||||
|
||||
if "batch_sizes" in yaml_config:
|
||||
args.batch_sizes = yaml_config["batch_sizes"]
|
||||
else:
|
||||
args.batch_sizes = None
|
||||
|
||||
# Model config
|
||||
if "model" in yaml_config:
|
||||
model = yaml_config["model"]
|
||||
args.num_layers = model.get("num_layers", args.num_layers)
|
||||
args.head_dim = model.get("head_dim", args.head_dim)
|
||||
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
|
||||
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
|
||||
args.block_size = model.get("block_size", args.block_size)
|
||||
|
||||
# Benchmark settings
|
||||
if "benchmark" in yaml_config:
|
||||
bench = yaml_config["benchmark"]
|
||||
args.device = bench.get("device", args.device)
|
||||
args.repeats = bench.get("repeats", args.repeats)
|
||||
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
|
||||
args.profile_memory = bench.get("profile_memory", args.profile_memory)
|
||||
|
||||
# Parameter sweep configuration
|
||||
if "parameter_sweep" in yaml_config:
|
||||
sweep_config = yaml_config["parameter_sweep"]
|
||||
args.parameter_sweep = ParameterSweep(
|
||||
param_name=sweep_config["param_name"],
|
||||
values=sweep_config["values"],
|
||||
include_auto=sweep_config.get("include_auto", False),
|
||||
label_format=sweep_config.get(
|
||||
"label_format", "{backend}_{param_name}_{value}"
|
||||
),
|
||||
)
|
||||
else:
|
||||
args.parameter_sweep = None
|
||||
|
||||
# Model parameter sweep configuration
|
||||
if "model_parameter_sweep" in yaml_config:
|
||||
sweep_config = yaml_config["model_parameter_sweep"]
|
||||
args.model_parameter_sweep = ModelParameterSweep(
|
||||
param_name=sweep_config["param_name"],
|
||||
values=sweep_config["values"],
|
||||
label_format=sweep_config.get(
|
||||
"label_format", "{backend}_{param_name}_{value}"
|
||||
),
|
||||
)
|
||||
else:
|
||||
args.model_parameter_sweep = None
|
||||
|
||||
# Output
|
||||
if "output" in yaml_config:
|
||||
output = yaml_config["output"]
|
||||
if "csv" in output and not args.output_csv:
|
||||
args.output_csv = output["csv"]
|
||||
if "json" in output and not args.output_json:
|
||||
args.output_json = output["json"]
|
||||
|
||||
console.print()
|
||||
|
||||
# Handle CLI-based parameter sweep (if not from YAML)
|
||||
if (
|
||||
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
|
||||
and args.sweep_param
|
||||
and args.sweep_values
|
||||
):
|
||||
args.parameter_sweep = ParameterSweep(
|
||||
param_name=args.sweep_param,
|
||||
values=args.sweep_values,
|
||||
include_auto=False,
|
||||
label_format="{backend}_{param_name}_{value}",
|
||||
)
|
||||
|
||||
# Determine backends
|
||||
backends = args.backends or ([args.backend] if args.backend else ["flash"])
|
||||
console.print(f"Backends: {', '.join(backends)}")
|
||||
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
|
||||
console.print()
|
||||
|
||||
# Run benchmarks
|
||||
all_results = []
|
||||
|
||||
# Handle special mode: decode_vs_prefill comparison
|
||||
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
|
||||
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
|
||||
console.print(
|
||||
"[dim]For each query length, testing both decode and prefill pipelines[/]"
|
||||
)
|
||||
console.print("[dim]Using batched execution for optimal performance[/]")
|
||||
|
||||
# Extract batch sizes from config
|
||||
batch_sizes = getattr(args, "batch_sizes", None) or [1]  # fall back to a single batch if unset
|
||||
backend = backends[0] # Use first backend (should only be one)
|
||||
|
||||
# Calculate total benchmarks
|
||||
total = len(batch_sizes)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for batch_size in batch_sizes:
|
||||
# Prepare all configs for this batch size
|
||||
configs_with_thresholds = []
|
||||
|
||||
for spec in args.batch_specs:
|
||||
# Parse the batch spec to get query length
|
||||
requests = parse_batch_spec(spec)
|
||||
if not requests:
|
||||
console.print(
|
||||
f"[red]Error: Could not parse batch spec '{spec}'[/]"
|
||||
)
|
||||
continue
|
||||
|
||||
# Get query length from first request
|
||||
query_length = requests[0].q_len
|
||||
|
||||
# Create batch spec for this batch size
|
||||
# For batch_size > 1, we need to prepend the count
|
||||
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
|
||||
|
||||
# Create base config (without backend name)
|
||||
base_config = BenchmarkConfig(
|
||||
backend=backend, # Will be overridden later
|
||||
batch_spec=batch_spec,
|
||||
num_layers=args.num_layers,
|
||||
head_dim=args.head_dim,
|
||||
num_q_heads=args.num_q_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
block_size=args.block_size,
|
||||
device=args.device,
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
)
|
||||
|
||||
# Add decode pipeline config
|
||||
decode_threshold = query_length
|
||||
config_decode = replace(
|
||||
base_config,
|
||||
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
|
||||
)
|
||||
configs_with_thresholds.append((config_decode, decode_threshold))
|
||||
|
||||
# Add prefill pipeline config if query_length > 1
|
||||
if query_length > 1:
|
||||
prefill_threshold = query_length - 1
|
||||
config_prefill = replace(
|
||||
base_config,
|
||||
backend=f"{backend}_prefill_qlen{query_length}"
|
||||
f"_bs{batch_size}",
|
||||
)
|
||||
configs_with_thresholds.append(
|
||||
(config_prefill, prefill_threshold)
|
||||
)
|
||||
|
||||
# Run all benchmarks for this batch size in one go (batched mode)
|
||||
try:
|
||||
from mla_runner import run_mla_benchmark as run_mla
|
||||
|
||||
# Use batched API: pass list of (config, threshold) tuples
|
||||
timing_results = run_mla(backend, configs_with_thresholds)
|
||||
|
||||
# Create BenchmarkResult objects from timing results
|
||||
for (config, _), timing in zip(
|
||||
configs_with_thresholds, timing_results
|
||||
):
|
||||
result = BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=timing["mean"],
|
||||
std_time=timing["std"],
|
||||
min_time=timing["min"],
|
||||
max_time=timing["max"],
|
||||
throughput_tokens_per_sec=timing.get("throughput", None),
|
||||
)
|
||||
all_results.append(result)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
console.print(
|
||||
f"[red]Error running batched benchmarks for "
|
||||
f"batch_size={batch_size}: {e}[/]"
|
||||
)
|
||||
console.print("[red]Traceback:[/]")
|
||||
traceback.print_exc()
|
||||
# Add error results for all configs
|
||||
for config, _ in configs_with_thresholds:
|
||||
result = BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=float("inf"),
|
||||
std_time=0,
|
||||
min_time=float("inf"),
|
||||
max_time=float("inf"),
|
||||
error=str(e),
|
||||
)
|
||||
all_results.append(result)
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display decode vs prefill results
|
||||
console.print("\n[bold green]Decode vs Prefill Results:[/]")
|
||||
|
||||
# Group by batch size
|
||||
by_batch_size = {}
|
||||
for r in all_results:
|
||||
if r.success:
|
||||
# Extract batch size from backend name
|
||||
parts = r.config.backend.split("_")
|
||||
bs_part = [p for p in parts if p.startswith("bs")]
|
||||
if bs_part:
|
||||
bs = int(bs_part[0][2:])
|
||||
if bs not in by_batch_size:
|
||||
by_batch_size[bs] = []
|
||||
by_batch_size[bs].append(r)
|
||||
|
||||
# For each batch size, analyze crossover point
|
||||
for bs in sorted(by_batch_size.keys()):
|
||||
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
|
||||
results = by_batch_size[bs]
|
||||
|
||||
# Group by query length
|
||||
by_qlen = {}
|
||||
for r in results:
|
||||
parts = r.config.backend.split("_")
|
||||
qlen_part = [p for p in parts if p.startswith("qlen")]
|
||||
if qlen_part:
|
||||
qlen = int(qlen_part[0][4:])
|
||||
if qlen not in by_qlen:
|
||||
by_qlen[qlen] = {}
|
||||
|
||||
pipeline = "decode" if "decode" in r.config.backend else "prefill"
|
||||
by_qlen[qlen][pipeline] = r
|
||||
|
||||
# Find crossover point
|
||||
last_decode_faster = None
|
||||
for qlen in sorted(by_qlen.keys()):
|
||||
pipelines = by_qlen[qlen]
|
||||
if "decode" in pipelines and "prefill" in pipelines:
|
||||
decode_time = pipelines["decode"].mean_time
|
||||
prefill_time = pipelines["prefill"].mean_time
|
||||
faster = "decode" if decode_time < prefill_time else "prefill"
|
||||
|
||||
speedup = (
|
||||
prefill_time / decode_time
|
||||
if decode_time < prefill_time
|
||||
else decode_time / prefill_time
|
||||
)
|
||||
|
||||
console.print(
|
||||
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
|
||||
f"prefill={prefill_time:.6f}s -> "
|
||||
f"[bold]{faster}[/] ({speedup:.2f}x)"
|
||||
)
|
||||
|
||||
if faster == "decode":
|
||||
last_decode_faster = qlen
|
||||
|
||||
if last_decode_faster is not None:
|
||||
optimal_threshold = last_decode_faster
|
||||
console.print(
|
||||
f"\n [bold green]Optimal threshold for batch_size={bs}: "
|
||||
f"{optimal_threshold}[/]"
|
||||
)
|
||||
console.print(
|
||||
f" [dim](Use decode pipeline for query_length <= "
|
||||
f"{optimal_threshold})[/]"
|
||||
)
|
||||
else:
|
||||
console.print(
|
||||
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
|
||||
)
|
||||
|
||||
# Handle model parameter sweep mode
|
||||
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
|
||||
# Model parameter sweep
|
||||
base_config_args = {
|
||||
"num_layers": args.num_layers,
|
||||
"head_dim": args.head_dim,
|
||||
"num_q_heads": args.num_q_heads,
|
||||
"num_kv_heads": args.num_kv_heads,
|
||||
"block_size": args.block_size,
|
||||
"device": args.device,
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
}
|
||||
all_results = run_model_parameter_sweep(
|
||||
backends,
|
||||
args.batch_specs,
|
||||
base_config_args,
|
||||
args.model_parameter_sweep,
|
||||
console,
|
||||
)
|
||||
|
||||
# Handle parameter sweep mode (unified)
|
||||
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
|
||||
# Unified parameter sweep
|
||||
base_config_args = {
|
||||
"num_layers": args.num_layers,
|
||||
"head_dim": args.head_dim,
|
||||
"num_q_heads": args.num_q_heads,
|
||||
"num_kv_heads": args.num_kv_heads,
|
||||
"block_size": args.block_size,
|
||||
"device": args.device,
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
}
|
||||
all_results = run_parameter_sweep(
|
||||
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
|
||||
)
|
||||
|
||||
else:
|
||||
# Normal mode: compare backends
|
||||
total = len(backends) * len(args.batch_specs)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for spec in args.batch_specs:
|
||||
for backend in backends:
|
||||
config = BenchmarkConfig(
|
||||
backend=backend,
|
||||
batch_spec=spec,
|
||||
num_layers=args.num_layers,
|
||||
head_dim=args.head_dim,
|
||||
num_q_heads=args.num_q_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
block_size=args.block_size,
|
||||
device=args.device,
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
)
|
||||
|
||||
result = run_benchmark(config)
|
||||
all_results.append(result)
|
||||
|
||||
if not result.success:
|
||||
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display results
|
||||
console.print("\n[bold green]Results:[/]")
|
||||
formatter = ResultsFormatter(console)
|
||||
formatter.print_table(all_results, backends)
|
||||
|
||||
# Save results
|
||||
if all_results:
|
||||
formatter = ResultsFormatter(console)
|
||||
if args.output_csv:
|
||||
formatter.save_csv(all_results, args.output_csv)
|
||||
if args.output_json:
|
||||
formatter.save_json(all_results, args.output_json)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
benchmarks/attention_benchmarks/common.py (new file, 503 lines)
@@ -0,0 +1,503 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""Common utilities for attention benchmarking."""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import math
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
# Mock classes for vLLM attention infrastructure
|
||||
|
||||
|
||||
class MockHfConfig:
|
||||
"""Mock HuggingFace config that satisfies vLLM's requirements."""
|
||||
|
||||
def __init__(self, mla_dims: dict):
|
||||
self.num_attention_heads = mla_dims["num_q_heads"]
|
||||
self.num_key_value_heads = mla_dims["num_kv_heads"]
|
||||
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
|
||||
self.model_type = "deepseek_v2"
|
||||
self.is_encoder_decoder = False
|
||||
self.kv_lora_rank = mla_dims["kv_lora_rank"]
|
||||
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
|
||||
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
|
||||
self.v_head_dim = mla_dims["v_head_dim"]
|
||||
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
|
||||
|
||||
def get_text_config(self):
|
||||
return self
|
||||
|
||||
|
||||
# Import AttentionLayerBase at module level to avoid circular dependencies
|
||||
try:
|
||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
|
||||
_HAS_ATTENTION_LAYER_BASE = True
|
||||
except ImportError:
|
||||
_HAS_ATTENTION_LAYER_BASE = False
|
||||
AttentionLayerBase = object # Fallback
|
||||
|
||||
|
||||
class MockKVBProj:
|
||||
"""Mock KV projection layer for MLA prefill mode.
|
||||
|
||||
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
|
||||
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
|
||||
"""
|
||||
|
||||
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
|
||||
self.num_heads = num_heads
|
||||
self.qk_nope_head_dim = qk_nope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
self.out_dim = qk_nope_head_dim + v_head_dim
|
||||
|
||||
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
"""
|
||||
Project kv_c_normed to output space.
|
||||
|
||||
Args:
|
||||
x: Input tensor [num_tokens, kv_lora_rank]
|
||||
|
||||
Returns:
|
||||
Tuple containing output tensor
|
||||
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
|
||||
"""
|
||||
num_tokens = x.shape[0]
|
||||
result = torch.randn(
|
||||
num_tokens,
|
||||
self.num_heads,
|
||||
self.out_dim,
|
||||
device=x.device,
|
||||
dtype=x.dtype,
|
||||
)
|
||||
return (result,) # Return as tuple to match ColumnParallelLinear API
|
||||
|
||||
|
||||
class MockLayer(AttentionLayerBase):
|
||||
"""Mock attention layer with scale parameters and impl.
|
||||
|
||||
Inherits from AttentionLayerBase so it passes isinstance checks
|
||||
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
|
||||
"""
|
||||
|
||||
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
|
||||
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
|
||||
self._k_scale = torch.tensor(1.0, device=device)
|
||||
self._v_scale = torch.tensor(1.0, device=device)
|
||||
self._q_scale = torch.tensor(1.0, device=device)
|
||||
# Scalar floats for kernels that need them
|
||||
self._k_scale_float = float(self._k_scale.item())
|
||||
self._v_scale_float = float(self._v_scale.item())
|
||||
self._q_scale_float = float(self._q_scale.item())
|
||||
# AttentionImpl for metadata builders to query
|
||||
self.impl = impl
|
||||
# KV cache spec for get_kv_cache_spec
|
||||
self._kv_cache_spec = kv_cache_spec
|
||||
|
||||
def get_attn_backend(self):
|
||||
"""Get the attention backend class (required by AttentionLayerBase)."""
|
||||
# Return None as this is just a mock layer for benchmarking
|
||||
return None
|
||||
|
||||
def get_kv_cache_spec(self):
|
||||
"""Get the KV cache spec (required by AttentionLayerBase)."""
|
||||
return self._kv_cache_spec
|
||||
|
||||
|
||||
class MockModelConfig:
|
||||
"""Mock model configuration."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_q_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
dtype: torch.dtype = torch.float16,
|
||||
max_model_len: int = 32768,
|
||||
):
|
||||
self._n_q = num_q_heads
|
||||
self._n_kv = num_kv_heads
|
||||
self._d = head_dim
|
||||
self.dtype = dtype
|
||||
self.max_model_len = max_model_len
|
||||
|
||||
def get_num_attention_heads(self, _=None) -> int:
|
||||
return self._n_q
|
||||
|
||||
def get_num_kv_heads(self, _=None) -> int:
|
||||
return self._n_kv
|
||||
|
||||
def get_head_size(self) -> int:
|
||||
return self._d
|
||||
|
||||
def get_num_layers(self) -> int:
|
||||
"""Mock method for layer count queries."""
|
||||
return 1
|
||||
|
||||
def get_sliding_window_for_layer(self, _layer_idx: int):
|
||||
"""Mock method for sliding window queries."""
|
||||
return None
|
||||
|
||||
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
|
||||
"""Mock method for logits soft cap queries."""
|
||||
return None
|
||||
|
||||
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
|
||||
"""Mock method for SM scale queries."""
|
||||
return 1.0 / (self.get_head_size() ** 0.5)
|
||||
|
||||
|
||||
class MockParallelConfig:
|
||||
"""Mock parallel configuration."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class MockCompilationConfig:
|
||||
"""Mock compilation configuration."""
|
||||
|
||||
def __init__(self):
|
||||
self.full_cuda_graph = False
|
||||
self.static_forward_context = {}
|
||||
|
||||
|
||||
class MockVLLMConfig:
|
||||
"""Mock VLLM configuration."""
|
||||
|
||||
def __init__(self):
|
||||
self.compilation_config = MockCompilationConfig()
|
||||
|
||||
|
||||
class MockRunner:
|
||||
"""Mock GPU runner for metadata builders."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
seq_lens: np.ndarray,
|
||||
query_start_locs: np.ndarray,
|
||||
device: torch.device,
|
||||
num_q_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
|
||||
self.parallel_config = MockParallelConfig()
|
||||
self.vllm_config = MockVLLMConfig()
|
||||
self.seq_lens_np = seq_lens
|
||||
self.query_start_loc_np = query_start_locs
|
||||
self.device = device
|
||||
self.attention_chunk_size = None
|
||||
self.num_query_heads = num_q_heads
|
||||
self.num_kv_heads = num_kv_heads
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParameterSweep:
|
||||
"""Configuration for sweeping a backend parameter."""
|
||||
|
||||
param_name: str # Name of the backend parameter to sweep
|
||||
values: list[Any] # List of values to test
|
||||
include_auto: bool = False # Also test with param unset (auto mode)
|
||||
label_format: str = "{backend}_{param_name}_{value}" # Result label template
|
||||
|
||||
def get_label(self, backend: str, value: Any) -> str:
|
||||
"""Generate a label for a specific parameter value."""
|
||||
return self.label_format.format(
|
||||
backend=backend, param_name=self.param_name, value=value
|
||||
)
|
||||
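# Illustrative usage (not part of this module): with the default label_format,
#   ParameterSweep(param_name="num_kv_splits", values=[4, 8, 16]).get_label("cutlass_mla", 8)
# returns "cutlass_mla_num_kv_splits_8".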
|
||||
|
||||
@dataclass
|
||||
class ModelParameterSweep:
|
||||
"""Configuration for sweeping a model configuration parameter."""
|
||||
|
||||
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
|
||||
values: list[Any] # List of values to test
|
||||
label_format: str = "{backend}_{param_name}_{value}" # Result label template
|
||||
|
||||
def get_label(self, backend: str, value: Any) -> str:
|
||||
"""Generate a label for a specific parameter value."""
|
||||
return self.label_format.format(
|
||||
backend=backend, param_name=self.param_name, value=value
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkConfig:
|
||||
"""Configuration for a single benchmark run."""
|
||||
|
||||
backend: str
|
||||
batch_spec: str
|
||||
num_layers: int
|
||||
head_dim: int
|
||||
num_q_heads: int
|
||||
num_kv_heads: int
|
||||
block_size: int
|
||||
device: str
|
||||
dtype: torch.dtype = torch.float16
|
||||
repeats: int = 1
|
||||
warmup_iters: int = 3
|
||||
profile_memory: bool = False
|
||||
use_cuda_graphs: bool = False
|
||||
|
||||
# MLA-specific
|
||||
kv_lora_rank: int | None = None
|
||||
qk_nope_head_dim: int | None = None
|
||||
qk_rope_head_dim: int | None = None
|
||||
v_head_dim: int | None = None
|
||||
|
||||
# Backend-specific tuning
|
||||
num_kv_splits: int | None = None # CUTLASS MLA
|
||||
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkResult:
|
||||
"""Results from a single benchmark run."""
|
||||
|
||||
config: BenchmarkConfig
|
||||
mean_time: float # seconds
|
||||
std_time: float # seconds
|
||||
min_time: float # seconds
|
||||
max_time: float # seconds
|
||||
throughput_tokens_per_sec: float | None = None
|
||||
memory_allocated_mb: float | None = None
|
||||
memory_reserved_mb: float | None = None
|
||||
error: str | None = None
|
||||
|
||||
@property
|
||||
def success(self) -> bool:
|
||||
"""Whether benchmark completed successfully."""
|
||||
return self.error is None
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Convert to dictionary for serialization."""
|
||||
return {
|
||||
"config": asdict(self.config),
|
||||
"mean_time": self.mean_time,
|
||||
"std_time": self.std_time,
|
||||
"min_time": self.min_time,
|
||||
"max_time": self.max_time,
|
||||
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
|
||||
"memory_allocated_mb": self.memory_allocated_mb,
|
||||
"memory_reserved_mb": self.memory_reserved_mb,
|
||||
"error": self.error,
|
||||
}
|
||||
|
||||
|
||||
class ResultsFormatter:
|
||||
"""Format and display benchmark results."""
|
||||
|
||||
def __init__(self, console: Console | None = None):
|
||||
self.console = console or Console()
|
||||
|
||||
def print_table(
|
||||
self,
|
||||
results: list[BenchmarkResult],
|
||||
backends: list[str],
|
||||
compare_to_fastest: bool = True,
|
||||
):
|
||||
"""
|
||||
Print results as a rich table.
|
||||
|
||||
Args:
|
||||
results: List of BenchmarkResult
|
||||
backends: List of backend names being compared
|
||||
compare_to_fastest: Show percentage comparison to fastest
|
||||
"""
|
||||
# Group by batch spec
|
||||
by_spec = {}
|
||||
for r in results:
|
||||
spec = r.config.batch_spec
|
||||
if spec not in by_spec:
|
||||
by_spec[spec] = {}
|
||||
by_spec[spec][r.config.backend] = r
|
||||
|
||||
# Create shortened backend names for display
|
||||
def shorten_backend_name(name: str) -> str:
|
||||
"""Shorten long backend names for table display."""
|
||||
# Remove common prefixes
|
||||
name = name.replace("flashattn_mla", "famla")
|
||||
name = name.replace("flashinfer_mla", "fimla")
|
||||
name = name.replace("flashmla", "fmla")
|
||||
name = name.replace("cutlass_mla", "cmla")
|
||||
name = name.replace("numsplits", "ns")
|
||||
return name
|
||||
|
||||
table = Table(title="Attention Benchmark Results")
|
||||
table.add_column("Batch\nSpec", no_wrap=True)
|
||||
|
||||
multi = len(backends) > 1
|
||||
for backend in backends:
|
||||
short_name = shorten_backend_name(backend)
|
||||
# Time column
|
||||
col_time = f"{short_name}\nTime (s)"
|
||||
table.add_column(col_time, justify="right", no_wrap=False)
|
||||
if multi and compare_to_fastest:
|
||||
# Relative performance column
|
||||
col_rel = f"{short_name}\nvs Best"
|
||||
table.add_column(col_rel, justify="right", no_wrap=False)
|
||||
|
||||
# Add rows
|
||||
for spec in sorted(by_spec.keys()):
|
||||
spec_results = by_spec[spec]
|
||||
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
|
||||
best_time = min(times.values()) if times else 0.0
|
||||
|
||||
row = [spec]
|
||||
for backend in backends:
|
||||
if backend in spec_results:
|
||||
r = spec_results[backend]
|
||||
if r.success:
|
||||
row.append(f"{r.mean_time:.6f}")
|
||||
if multi and compare_to_fastest:
|
||||
pct = (
|
||||
(r.mean_time / best_time * 100) if best_time > 0 else 0
|
||||
)
|
||||
pct_str = f"{pct:.1f}%"
|
||||
if r.mean_time == best_time:
|
||||
pct_str = f"[bold green]{pct_str}[/]"
|
||||
row.append(pct_str)
|
||||
else:
|
||||
row.append("[red]ERROR[/]")
|
||||
if multi and compare_to_fastest:
|
||||
row.append("-")
|
||||
else:
|
||||
row.append("-")
|
||||
if multi and compare_to_fastest:
|
||||
row.append("-")
|
||||
|
||||
table.add_row(*row)
|
||||
|
||||
self.console.print(table)
|
||||
|
||||
def save_csv(self, results: list[BenchmarkResult], path: str):
|
||||
"""Save results to CSV file."""
|
||||
if not results:
|
||||
return
|
||||
|
||||
path_obj = Path(path)
|
||||
path_obj.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(path, "w", newline="") as f:
|
||||
writer = csv.DictWriter(
|
||||
f,
|
||||
fieldnames=[
|
||||
"backend",
|
||||
"batch_spec",
|
||||
"num_layers",
|
||||
"mean_time",
|
||||
"std_time",
|
||||
"throughput",
|
||||
"memory_mb",
|
||||
],
|
||||
)
|
||||
writer.writeheader()
|
||||
for r in results:
|
||||
writer.writerow(
|
||||
{
|
||||
"backend": r.config.backend,
|
||||
"batch_spec": r.config.batch_spec,
|
||||
"num_layers": r.config.num_layers,
|
||||
"mean_time": r.mean_time,
|
||||
"std_time": r.std_time,
|
||||
"throughput": r.throughput_tokens_per_sec or 0,
|
||||
"memory_mb": r.memory_allocated_mb or 0,
|
||||
}
|
||||
)
|
||||
|
||||
self.console.print(f"[green]Saved CSV results to {path}[/]")
|
||||
|
||||
def save_json(self, results: list[BenchmarkResult], path: str):
|
||||
"""Save results to JSON file."""
|
||||
path_obj = Path(path)
|
||||
path_obj.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
data = [r.to_dict() for r in results]
|
||||
with open(path, "w") as f:
|
||||
json.dump(data, f, indent=2, default=str)
|
||||
|
||||
self.console.print(f"[green]Saved JSON results to {path}[/]")
|
||||
|
||||
|
||||
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
|
||||
"""
|
||||
Get MLA dimensions for known models.
|
||||
|
||||
Args:
|
||||
model_name: Model identifier
|
||||
|
||||
Returns:
|
||||
Dict with MLA dimension configuration
|
||||
"""
|
||||
configs = {
|
||||
"deepseek-v2": {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": 128,
|
||||
"num_kv_heads": 1,
|
||||
"head_dim": 576,
|
||||
},
|
||||
"deepseek-v3": {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": 128,
|
||||
"num_kv_heads": 1,
|
||||
"head_dim": 576,
|
||||
},
|
||||
"deepseek-v2-lite": {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": 16,
|
||||
"num_kv_heads": 1,
|
||||
"head_dim": 576,
|
||||
},
|
||||
}
|
||||
|
||||
if model_name not in configs:
|
||||
raise ValueError(
|
||||
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
|
||||
)
|
||||
|
||||
return configs[model_name]
|
||||
|
||||
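# Illustrative usage (not part of this module):
#   setup_mla_dims("deepseek-v2-lite")["num_q_heads"]  # -> 16
#   setup_mla_dims("unknown-model")  # raises ValueError listing the known models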
|
||||
def get_attention_scale(head_dim: int) -> float:
|
||||
"""Compute attention scale factor (1/sqrt(d))."""
|
||||
return 1.0 / math.sqrt(head_dim)
|
||||
|
||||
|
||||
def is_mla_backend(backend: str) -> bool:
|
||||
"""
|
||||
Check if backend is an MLA backend using the backend's is_mla() property.
|
||||
|
||||
Args:
|
||||
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
|
||||
|
||||
Returns:
|
||||
True if the backend is an MLA backend, False otherwise
|
||||
"""
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
try:
|
||||
backend_class = AttentionBackendEnum[backend.upper()].get_class()
|
||||
return backend_class.is_mla()
|
||||
except (KeyError, ValueError, ImportError):
|
||||
return False
|
||||
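A quick check of the helper above; whether a given name resolves depends on which backend classes are importable in the local vLLM build, so treat the expected values below as assumptions:

from common import is_mla_backend  # assumes the benchmark directory is on sys.path

print(is_mla_backend("cutlass_mla"))  # expected True where the CUTLASS MLA class resolves
print(is_mla_backend("triton"))       # expected False: not an MLA backend (or lookup fails)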
benchmarks/attention_benchmarks/configs/mla_decode.yaml (new file, 61 lines)
@@ -0,0 +1,61 @@
|
||||
# MLA decode-only benchmark configuration
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1 # MLA uses single latent KV
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||
|
||||
batch_specs:
|
||||
# Small batches, varying sequence lengths
|
||||
- "16q1s512" # 16 requests, 512 KV cache
|
||||
- "16q1s1k" # 16 requests, 1k KV cache
|
||||
- "16q1s2k" # 16 requests, 2k KV cache
|
||||
- "16q1s4k" # 16 requests, 4k KV cache
|
||||
|
||||
# Medium batches
|
||||
- "32q1s1k" # 32 requests, 1k KV cache
|
||||
- "32q1s2k" # 32 requests, 2k KV cache
|
||||
- "32q1s4k" # 32 requests, 4k KV cache
|
||||
- "32q1s8k" # 32 requests, 8k KV cache
|
||||
|
||||
# Large batches
|
||||
- "64q1s1k" # 64 requests, 1k KV cache
|
||||
- "64q1s2k" # 64 requests, 2k KV cache
|
||||
- "64q1s4k" # 64 requests, 4k KV cache
|
||||
- "64q1s8k" # 64 requests, 8k KV cache
|
||||
|
||||
# Very large batches
|
||||
- "128q1s1k" # 128 requests, 1k KV cache
|
||||
- "128q1s2k" # 128 requests, 2k KV cache
|
||||
|
||||
# Long context
|
||||
- "32q1s16k" # 32 requests, 16k KV cache
|
||||
- "32q1s32k" # 32 requests, 32k KV cache
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
|
||||
# Backend-specific tuning
|
||||
cutlass_mla:
|
||||
num_kv_splits: auto # or specific value like 4, 8, 16
|
||||
|
||||
flashattn_mla:
|
||||
reorder_batch_threshold: 512
|
||||
|
||||
flashmla:
|
||||
reorder_batch_threshold: 1
|
||||
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
# MLA mixed batch benchmark (prefill + decode)
|
||||
# Tests chunked prefill performance
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128
|
||||
|
||||
batch_specs:
|
||||
# Small prefill + decode
|
||||
- "1q1k_8q1s1k" # 1 prefill + 8 decode
|
||||
- "2q2k_16q1s1k" # 2 prefill + 16 decode
|
||||
- "4q1k_32q1s2k" # 4 prefill + 32 decode
|
||||
|
||||
# Medium prefill + decode
|
||||
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
|
||||
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
|
||||
- "8q2k_64q1s4k" # 8 prefill + 64 decode
|
||||
|
||||
# Large prefill + decode (chunked prefill stress test)
|
||||
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
|
||||
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
|
||||
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
|
||||
|
||||
# Context extension + decode
|
||||
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
|
||||
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
|
||||
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
|
||||
|
||||
# Explicitly chunked prefill
|
||||
- "q8k" # 8k prefill with chunking hint
|
||||
- "q16k" # 16k prefill with chunking hint
|
||||
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
|
||||
|
||||
# High decode ratio (realistic serving)
|
||||
- "1q2k_63q1s1k" # 1 prefill + 63 decode
|
||||
- "2q2k_62q1s2k" # 2 prefill + 62 decode
|
||||
- "4q4k_60q1s4k" # 4 prefill + 60 decode
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
|
||||
# Analyze chunked prefill workspace size impact
|
||||
chunked_prefill:
|
||||
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
|
||||
@@ -0,0 +1,88 @@
|
||||
# Study 4: What is optimal reorder_batch_threshold for MLA backends supporting query length > 1?
|
||||
# Question: At what query length does prefill pipeline become faster than decode pipeline?
|
||||
# Methodology: For each query length, compare decode vs prefill performance to find crossover point
|
||||
# Applies to: FlashAttn MLA, FlashMLA
|
||||
|
||||
description: "Decode vs Prefill pipeline crossover analysis"
|
||||
|
||||
# Test FlashAttn MLA
|
||||
backend: flashattn_mla
|
||||
|
||||
# Mode: decode_vs_prefill comparison (special sweep mode)
|
||||
# For each batch spec, we'll test both decode and prefill pipelines
|
||||
mode: "decode_vs_prefill"
|
||||
|
||||
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
|
||||
# Each query length will be tested with BOTH decode and prefill pipelines:
|
||||
# - decode: threshold >= query_length (forces decode pipeline)
|
||||
# - prefill: threshold < query_length (forces prefill pipeline)
|
||||
#
|
||||
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
|
||||
# This tests different query lengths with fixed sequence length context
|
||||
#
|
||||
# Using batch_spec_ranges for automatic generation:
|
||||
batch_spec_ranges:
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 1
|
||||
stop: 16
|
||||
step: 1
|
||||
end_inclusive: false
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 16
|
||||
stop: 64
|
||||
step: 2
|
||||
end_inclusive: false
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 64
|
||||
stop: 1024
|
||||
step: 4
|
||||
end_inclusive: true
|
||||
|
||||
# Batch sizes to test (from old script)
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
|
||||
# Model configuration (DeepSeek V2/V3 defaults)
|
||||
model:
|
||||
num_layers: 10
|
||||
head_dim: 576
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
block_size: 128
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 15 # More repeats for spec decode variance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Output
|
||||
output:
|
||||
csv: "reorder_threshold_results.csv"
|
||||
json: "reorder_threshold_results.json"
|
||||
|
||||
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
|
||||
# - For each batch size, find the crossover point where prefill becomes faster than decode
|
||||
# - Show decode vs prefill performance across all query lengths
|
||||
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
|
||||
# - Understand how crossover point varies with batch size
|
||||
# - Provide data-driven guidance for default threshold value
|
||||
#
|
||||
# Methodology (from old script):
|
||||
# - Each query length tested with BOTH pipelines:
|
||||
# * decode: threshold >= query_length (forces decode pipeline)
|
||||
# * prefill: threshold < query_length (forces prefill pipeline)
|
||||
# - Compare which is faster to find crossover point
|
||||
#
|
||||
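The batch_spec_ranges entries above are expanded into concrete specs by generate_batch_specs_from_ranges, which is invoked from the CLI shown earlier but whose body is not part of this excerpt. A minimal sketch of how such expansion could work, assuming one swept placeholder per entry and half-open ranges unless end_inclusive is set (the real implementation may differ):

def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
    """Sketch only: expand {template, <param>: {start, stop, step, end_inclusive}} entries."""
    specs: list[str] = []
    for entry in ranges:
        template = entry["template"]
        # Assume exactly one swept placeholder per entry, e.g. "q_len" in "q{q_len}s1k".
        ((name, rng),) = tuple((k, v) for k, v in entry.items() if k != "template")
        stop = rng["stop"] + (1 if rng.get("end_inclusive", False) else 0)
        for value in range(rng["start"], stop, rng["step"]):
            specs.append(template.format(**{name: value}))
    return specs

# The first range above (start=1, stop=16, step=1, end_inclusive: false) would expand to
# ["q1s1k", "q2s1k", ..., "q15s1k"].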
@@ -0,0 +1,62 @@
|
||||
# Speculative decoding benchmark configuration
|
||||
# Tests reorder_batch_threshold optimization
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
|
||||
batch_specs:
|
||||
# Pure speculative decode (K-token verification)
|
||||
- "q2s1k" # 2-token spec, 1k KV
|
||||
- "q4s1k" # 4-token spec, 1k KV
|
||||
- "q8s1k" # 8-token spec, 1k KV
|
||||
- "q16s1k" # 16-token spec, 1k KV
|
||||
|
||||
# Speculative with different context lengths
|
||||
- "q4s2k" # 4-token spec, 2k KV
|
||||
- "q4s4k" # 4-token spec, 4k KV
|
||||
- "q8s2k" # 8-token spec, 2k KV
|
||||
- "q8s4k" # 8-token spec, 4k KV
|
||||
|
||||
# Mixed: speculative + regular decode
|
||||
- "32q4s1k" # 32 spec requests
|
||||
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
|
||||
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
|
||||
|
||||
# Mixed: speculative + prefill + decode
|
||||
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
|
||||
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
|
||||
|
||||
# Large batches with speculation
|
||||
- "64q4s1k" # 64 spec requests
|
||||
- "32q8s2k" # 32 spec (8-token)
|
||||
- "16q16s4k" # 16 spec (16-token)
|
||||
|
||||
# Backends that support query length > 1
|
||||
backends:
|
||||
- flashattn_mla # reorder_batch_threshold = 512
|
||||
- flashmla # reorder_batch_threshold = 1 (tunable)
|
||||
|
||||
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
|
||||
# - flashinfer_mla
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 10 # More repeats for statistical significance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Test these threshold values for optimization
|
||||
parameter_sweep:
|
||||
param_name: "reorder_batch_threshold"
|
||||
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
|
||||
include_auto: false
|
||||
label_format: "{backend}_threshold_{value}"
|
||||
@@ -0,0 +1,40 @@
|
||||
# Standard attention backend benchmark configuration
|
||||
|
||||
model:
|
||||
num_layers: 32
|
||||
num_q_heads: 32
|
||||
num_kv_heads: 8 # GQA with 4:1 ratio
|
||||
head_dim: 128
|
||||
block_size: 16
|
||||
|
||||
batch_specs:
|
||||
# Pure prefill
|
||||
- "q512" # Small prefill (512 tokens)
|
||||
- "q2k" # Medium prefill (2048 tokens)
|
||||
- "q4k" # Large prefill (4096 tokens)
|
||||
- "q8k" # Very large prefill (8192 tokens)
|
||||
|
||||
# Pure decode
|
||||
- "8q1s1k" # 8 requests, 1k KV cache each
|
||||
- "16q1s2k" # 16 requests, 2k KV cache each
|
||||
- "32q1s1k" # 32 requests, 1k KV cache each
|
||||
- "64q1s4k" # 64 requests, 4k KV cache each
|
||||
|
||||
# Mixed prefill/decode
|
||||
- "2q2k_8q1s1k" # 2 prefill + 8 decode
|
||||
- "4q1k_16q1s2k" # 4 prefill + 16 decode
|
||||
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
|
||||
|
||||
# Context extension
|
||||
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
|
||||
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
|
||||
|
||||
backends:
|
||||
- flash
|
||||
- triton
|
||||
- flashinfer
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: false
|
||||
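Reading the batch-spec grammar used across these configs (per the inline comments, "q2k" is a single 2048-token prefill, "8q1s1k" is eight single-token decodes over a 1k KV cache, and groups are joined with underscores), here is a hedged sketch of how a consumer reads a spec. It assumes parse_batch_spec from batch_spec.py (not shown in this excerpt) returns request objects exposing q_len and kv_len, which is how mla_runner.py below consumes it:

from batch_spec import parse_batch_spec  # module added in this PR; body not shown here

requests = parse_batch_spec("2q2k_8q1s1k")        # 2 prefills + 8 decodes
total_q = sum(r.q_len for r in requests)          # expected 2 * 2048 + 8 * 1 = 4104
max_kv = max(r.kv_len for r in requests)          # expected 2048 (the prefill requests)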
benchmarks/attention_benchmarks/mla_runner.py (new file, 836 lines)
@@ -0,0 +1,836 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
MLA benchmark runner - shared utilities for MLA benchmarks.
|
||||
|
||||
This module provides helpers for running MLA backends without
|
||||
needing full VllmConfig integration.
|
||||
"""
|
||||
|
||||
import importlib
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from batch_spec import parse_batch_spec
|
||||
from common import (
|
||||
BenchmarkResult,
|
||||
MockHfConfig,
|
||||
MockKVBProj,
|
||||
MockLayer,
|
||||
setup_mla_dims,
|
||||
)
|
||||
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
CompilationConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# VllmConfig Creation
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
|
||||
"""
|
||||
Add mock methods for layer-specific queries to ModelConfig.
|
||||
|
||||
These methods are needed by metadata builders but aren't normally
|
||||
present on ModelConfig when used in benchmark contexts.
|
||||
"""
|
||||
import types
|
||||
|
||||
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
|
||||
model_config.get_sliding_window_for_layer = types.MethodType(
|
||||
lambda self, _i: None, model_config
|
||||
)
|
||||
model_config.get_logits_soft_cap_for_layer = types.MethodType(
|
||||
lambda self, _i: None, model_config
|
||||
)
|
||||
model_config.get_sm_scale_for_layer = types.MethodType(
|
||||
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
|
||||
)
|
||||
|
||||
|
||||
def create_minimal_vllm_config(
|
||||
model_name: str = "deepseek-v3",
|
||||
block_size: int = 128,
|
||||
max_num_seqs: int = 256,
|
||||
mla_dims: dict | None = None,
|
||||
) -> VllmConfig:
|
||||
"""
|
||||
Create minimal VllmConfig for MLA benchmarks.
|
||||
|
||||
Args:
|
||||
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
|
||||
provided
|
||||
block_size: KV cache block size
|
||||
max_num_seqs: Maximum number of sequences
|
||||
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
|
||||
setup_mla_dims(model_name)
|
||||
|
||||
Returns:
|
||||
VllmConfig for benchmarking
|
||||
"""
|
||||
# Get MLA dimensions - use provided or load from model name
|
||||
if mla_dims is None:
|
||||
mla_dims = setup_mla_dims(model_name)
|
||||
|
||||
# Create mock HF config first (avoids downloading from HuggingFace)
|
||||
mock_hf_config = MockHfConfig(mla_dims)
|
||||
|
||||
# Create a temporary minimal config.json to avoid HF downloads
|
||||
# This ensures consistent ModelConfig construction without network access
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
minimal_config = {
|
||||
"architectures": ["DeepseekV2ForCausalLM"],
|
||||
"model_type": "deepseek_v2",
|
||||
"num_attention_heads": mla_dims["num_q_heads"],
|
||||
"num_key_value_heads": mla_dims["num_kv_heads"],
|
||||
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
|
||||
"torch_dtype": "bfloat16",
|
||||
"max_position_embeddings": 163840, # DeepSeek V3 default
|
||||
"rope_theta": 10000.0,
|
||||
"vocab_size": 128256,
|
||||
}
|
||||
|
||||
# Create temporary directory with config.json
|
||||
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
|
||||
config_path = os.path.join(temp_dir, "config.json")
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(minimal_config, f)
|
||||
|
||||
try:
|
||||
# Create model config using local path - no HF downloads
|
||||
model_config = ModelConfig(
|
||||
model=temp_dir, # Use local temp directory
|
||||
tokenizer=None,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
seed=0,
|
||||
max_model_len=32768,
|
||||
quantization=None,
|
||||
quantization_param_path=None,
|
||||
enforce_eager=False,
|
||||
max_context_len_to_capture=None,
|
||||
max_seq_len_to_capture=8192,
|
||||
max_logprobs=20,
|
||||
disable_sliding_window=False,
|
||||
skip_tokenizer_init=True,
|
||||
served_model_name=None,
|
||||
limit_mm_per_prompt=None,
|
||||
use_async_output_proc=True,
|
||||
config_format="auto",
|
||||
)
|
||||
finally:
|
||||
# Clean up temporary directory
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
# Override with our mock config
|
||||
model_config.hf_config = mock_hf_config
|
||||
model_config.hf_text_config = mock_hf_config
|
||||
|
||||
# Add mock methods for layer-specific queries
|
||||
_add_mock_methods_to_model_config(model_config)
|
||||
|
||||
# Create sub-configs
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=32768,
|
||||
is_encoder_decoder=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
parallel_config = ParallelConfig(
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
compilation_config = CompilationConfig()
|
||||
|
||||
return VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
scheduler_config=scheduler_config,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Backend Configuration
|
||||
# ============================================================================
|
||||
|
||||
|
||||
# Backend name to class name prefix mapping
|
||||
_BACKEND_NAME_MAP = {
|
||||
"flashattn_mla": "FlashAttnMLA",
|
||||
"flashmla": "FlashMLA",
|
||||
"flashinfer_mla": "FlashInferMLA",
|
||||
"cutlass_mla": "CutlassMLA",
|
||||
}
|
||||
|
||||
# Special properties that differ from defaults
|
||||
_BACKEND_PROPERTIES = {
|
||||
"flashmla": {
|
||||
"query_format": "concat", # Single concatenated tensor (vs tuple)
|
||||
"block_size": 64, # FlashMLA uses fixed block size
|
||||
},
|
||||
"flashinfer_mla": {
|
||||
"block_size": 64, # FlashInfer MLA only supports 32 or 64
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _get_backend_config(backend: str) -> dict:
|
||||
"""
|
||||
Get backend configuration using naming conventions.
|
||||
|
||||
All MLA backends follow the pattern:
|
||||
- Module: vllm.v1.attention.backends.mla.{backend}
|
||||
- Impl: {Name}Impl
|
||||
- Metadata: {Name}Metadata (or MLACommonMetadata)
|
||||
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
|
||||
- MetadataBuilder: {Name}MetadataBuilder
|
||||
"""
|
||||
if backend not in _BACKEND_NAME_MAP:
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
name = _BACKEND_NAME_MAP[backend]
|
||||
props = _BACKEND_PROPERTIES.get(backend, {})
|
||||
|
||||
# Check if backend uses common metadata (FlashInfer, CUTLASS)
|
||||
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
|
||||
|
||||
return {
|
||||
"module": f"vllm.v1.attention.backends.mla.{backend}",
|
||||
"impl_class": f"{name}Impl",
|
||||
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
|
||||
"decode_metadata_class": "MLACommonDecodeMetadata"
|
||||
if uses_common
|
||||
else f"{name}DecodeMetadata",
|
||||
"builder_class": f"{name}MetadataBuilder",
|
||||
"query_format": props.get("query_format", "tuple"),
|
||||
"block_size": props.get("block_size", None),
|
||||
}
|
||||
|
||||
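# Illustrative result (not part of this module): _get_backend_config("flashmla") resolves to
#   module="vllm.v1.attention.backends.mla.flashmla", impl_class="FlashMLAImpl",
#   builder_class="FlashMLAMetadataBuilder", query_format="concat", block_size=64,
# while flashinfer_mla and cutlass_mla fall back to the MLACommon* metadata classes.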
|
||||
# ============================================================================
|
||||
# Metadata Building Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _build_attention_metadata(
|
||||
requests: list,
|
||||
block_size: int,
|
||||
device: torch.device,
|
||||
builder_instance,
|
||||
) -> tuple:
|
||||
"""
|
||||
Build attention metadata from batch requests.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest objects
|
||||
block_size: KV cache block size
|
||||
device: Target device
|
||||
builder_instance: Metadata builder instance
|
||||
|
||||
Returns:
|
||||
Tuple of (metadata, kv_cache_num_blocks)
|
||||
"""
|
||||
q_lens = [r.q_len for r in requests]
|
||||
kv_lens = [r.kv_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
max_kv = max(kv_lens)
|
||||
|
||||
# Build query start locations
|
||||
q_start_cpu = torch.tensor(
|
||||
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
|
||||
dtype=torch.int32,
|
||||
)
|
||||
q_start_gpu = q_start_cpu.to(device)
|
||||
|
||||
# Build sequence lengths
|
||||
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
|
||||
seq_lens_gpu = seq_lens_cpu.to(device)
|
||||
|
||||
# Build num_computed_tokens (context length for each request)
|
||||
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
|
||||
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
|
||||
|
||||
# Build block table
|
||||
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
|
||||
max_num_blocks = max(num_blocks_per_req)
|
||||
|
||||
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
|
||||
current_block = 0
|
||||
for i, num_blocks in enumerate(num_blocks_per_req):
|
||||
for j in range(num_blocks):
|
||||
block_table_cpu[i, j] = current_block
|
||||
current_block += 1
|
||||
|
||||
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
|
||||
|
||||
# Build slot mapping
|
||||
slot_mapping_list = []
|
||||
for i, (q_len, kv_len, num_blocks) in enumerate(
|
||||
zip(q_lens, kv_lens, num_blocks_per_req)
|
||||
):
|
||||
context_len = kv_len - q_len
|
||||
for j in range(q_len):
|
||||
token_kv_idx = context_len + j
|
||||
block_idx = token_kv_idx // block_size
|
||||
offset_in_block = token_kv_idx % block_size
|
||||
global_block_id = block_table_cpu[i, block_idx]
|
||||
slot_id = global_block_id * block_size + offset_in_block
|
||||
slot_mapping_list.append(slot_id)
|
||||
|
||||
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
|
||||
|
||||
# Create CommonAttentionMetadata
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
|
||||
common_attn_metadata = CommonAttentionMetadata(
|
||||
num_reqs=len(requests),
|
||||
max_query_len=max(q_lens),
|
||||
max_seq_len=max_kv,
|
||||
num_actual_tokens=total_q,
|
||||
query_start_loc=q_start_gpu,
|
||||
query_start_loc_cpu=q_start_cpu,
|
||||
seq_lens=seq_lens_gpu,
|
||||
_seq_lens_cpu=seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
slot_mapping=slot_mapping,
|
||||
block_table_tensor=block_table_gpu,
|
||||
dcp_local_seq_lens=None,
|
||||
)
|
||||
|
||||
# Use the production build() method
|
||||
metadata = builder_instance.build(
|
||||
common_prefix_len=0,
|
||||
common_attn_metadata=common_attn_metadata,
|
||||
fast_build=False,
|
||||
)
|
||||
|
||||
return metadata, current_block
|
||||
|
||||
|
||||
def _create_input_tensors(
|
||||
total_q: int,
|
||||
mla_dims: dict,
|
||||
query_format: str,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
"""
|
||||
Create input tensors for both decode and prefill modes.
|
||||
|
||||
MLA requires different tensor formats for decode vs prefill:
|
||||
- Decode: Uses kv_lora_rank (512) dimension
|
||||
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
|
||||
|
||||
Args:
|
||||
total_q: Total number of query tokens
|
||||
mla_dims: MLA dimension configuration
|
||||
query_format: Either "tuple" or "concat"
|
||||
device: Target device
|
||||
dtype: Tensor dtype
|
||||
|
||||
Returns:
|
||||
Tuple of (decode_inputs, prefill_inputs)
|
||||
- decode_inputs: Query tensor(s) for decode mode
|
||||
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill
|
||||
"""
|
||||
if query_format == "tuple":
|
||||
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
|
||||
q_nope_decode = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["kv_lora_rank"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
q_pe = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
decode_inputs = (q_nope_decode, q_pe)
|
||||
|
||||
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
|
||||
q_nope_prefill = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["qk_nope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
|
||||
else: # concat
|
||||
decode_inputs = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
# For prefill with concat format
|
||||
prefill_q = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
# Create additional inputs needed for prefill forward
|
||||
k_c_normed = torch.randn(
|
||||
total_q,
|
||||
mla_dims["kv_lora_rank"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
k_pe = torch.randn(
|
||||
total_q,
|
||||
1, # Single head for MLA
|
||||
mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
k_scale = torch.ones(1, device=device, dtype=torch.float32)
|
||||
|
||||
output = torch.zeros(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
prefill_inputs = {
|
||||
"q": prefill_q,
|
||||
"k_c_normed": k_c_normed,
|
||||
"k_pe": k_pe,
|
||||
"k_scale": k_scale,
|
||||
"output": output,
|
||||
}
|
||||
|
||||
return decode_inputs, prefill_inputs
|
||||
|
||||
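# Shape check for the deepseek-v3 dims used above (illustrative, not part of this module):
# with num_q_heads=128, kv_lora_rank=512, qk_rope_head_dim=64, qk_nope_head_dim=128,
# "tuple" decode inputs are q_nope [total_q, 128, 512] and q_pe [total_q, 128, 64],
# and the prefill query is [total_q, 128, 192] (nope and rope concatenated).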
|
||||
# ============================================================================
|
||||
# Backend Initialization
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _create_backend_impl(
|
||||
backend_cfg: dict,
|
||||
mla_dims: dict,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
):
|
||||
"""
|
||||
Create backend implementation instance.
|
||||
|
||||
Args:
|
||||
backend_cfg: Backend configuration dict
|
||||
mla_dims: MLA dimension configuration
|
||||
vllm_config: VllmConfig instance
|
||||
device: Target device
|
||||
|
||||
Returns:
|
||||
Tuple of (impl, layer, builder_instance)
|
||||
"""
|
||||
# Import backend classes
|
||||
backend_module = importlib.import_module(backend_cfg["module"])
|
||||
impl_class = getattr(backend_module, backend_cfg["impl_class"])
|
||||
|
||||
# Calculate scale
|
||||
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
|
||||
|
||||
# Create mock kv_b_proj layer for prefill mode
|
||||
mock_kv_b_proj = MockKVBProj(
|
||||
num_heads=mla_dims["num_q_heads"],
|
||||
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
|
||||
v_head_dim=mla_dims["v_head_dim"],
|
||||
)
|
||||
|
||||
# Create impl
|
||||
impl = impl_class(
|
||||
num_heads=mla_dims["num_q_heads"],
|
||||
head_size=mla_dims["head_dim"],
|
||||
scale=scale,
|
||||
num_kv_heads=mla_dims["num_kv_heads"],
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype="auto",
|
||||
logits_soft_cap=None,
|
||||
attn_type="decoder",
|
||||
kv_sharing_target_layer_name=None,
|
||||
q_lora_rank=None,
|
||||
kv_lora_rank=mla_dims["kv_lora_rank"],
|
||||
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
|
||||
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
|
||||
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
|
||||
v_head_dim=mla_dims["v_head_dim"],
|
||||
kv_b_proj=mock_kv_b_proj,
|
||||
)
|
||||
|
||||
# Initialize DCP attributes
|
||||
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
|
||||
impl.dcp_world_size = 1
|
||||
impl.dcp_rank = 0
|
||||
|
||||
# Create KV cache spec for MockLayer
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
|
||||
num_kv_heads=1, # MLA uses 1 KV head
|
||||
head_size=576, # MLA head dim
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# Create mock layer
|
||||
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
|
||||
|
||||
# Create builder instance if needed
|
||||
builder_instance = None
|
||||
if backend_cfg["builder_class"]:
|
||||
builder_class = getattr(backend_module, backend_cfg["builder_class"])
|
||||
|
||||
# Populate static_forward_context so builder can find the layer
|
||||
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
|
||||
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
|
||||
|
||||
builder_instance = builder_class(
|
||||
kv_cache_spec=kv_cache_spec,
|
||||
layer_names=["placeholder"],
|
||||
vllm_config=vllm_config,
|
||||
device=device,
|
||||
)
|
||||
|
||||
return impl, layer, builder_instance
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Config Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _extract_mla_dims_from_config(config) -> dict | None:
|
||||
"""
|
||||
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
|
||||
|
||||
Args:
|
||||
config: BenchmarkConfig instance
|
||||
|
||||
Returns:
|
||||
Dict with MLA dimensions if all fields are provided, None otherwise
|
||||
"""
|
||||
# Check if all MLA-specific fields are provided
|
||||
if all(
|
||||
[
|
||||
config.kv_lora_rank is not None,
|
||||
config.qk_nope_head_dim is not None,
|
||||
config.qk_rope_head_dim is not None,
|
||||
config.v_head_dim is not None,
|
||||
]
|
||||
):
|
||||
return {
|
||||
"kv_lora_rank": config.kv_lora_rank,
|
||||
"qk_nope_head_dim": config.qk_nope_head_dim,
|
||||
"qk_rope_head_dim": config.qk_rope_head_dim,
|
||||
"v_head_dim": config.v_head_dim,
|
||||
"num_q_heads": config.num_q_heads,
|
||||
"num_kv_heads": config.num_kv_heads,
|
||||
"head_dim": config.head_dim,
|
||||
}
|
||||
# Fallback: if MLA fields not fully specified, try to construct from basic fields
|
||||
elif config.head_dim == 576:
|
||||
# This looks like a DeepSeek MLA config, use standard dimensions with custom
|
||||
# head count
|
||||
return {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": config.num_q_heads,
|
||||
"num_kv_heads": config.num_kv_heads,
|
||||
"head_dim": config.head_dim,
|
||||
}
|
||||
return None
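# Illustrative sanity check for the DeepSeek-style fallback above (standalone
# sketch, not part of the benchmark): the cached per-token width is
# kv_lora_rank + qk_rope_head_dim = 512 + 64 = 576, which is why head_dim == 576
# is used as the heuristic, and the per-head query width is
# qk_nope_head_dim + qk_rope_head_dim = 128 + 64 = 192.
_deepseek_mla_defaults = {
    "kv_lora_rank": 512,
    "qk_nope_head_dim": 128,
    "qk_rope_head_dim": 64,
    "v_head_dim": 128,
}
assert _deepseek_mla_defaults["kv_lora_rank"] + _deepseek_mla_defaults["qk_rope_head_dim"] == 576
assert _deepseek_mla_defaults["qk_nope_head_dim"] + _deepseek_mla_defaults["qk_rope_head_dim"] == 192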
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Benchmark Execution
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
builder_instance,
|
||||
backend_cfg: dict,
|
||||
mla_dims: dict,
|
||||
device: torch.device,
|
||||
) -> BenchmarkResult:
|
||||
"""
|
||||
Run a single benchmark iteration.
|
||||
|
||||
Args:
|
||||
config: BenchmarkConfig instance
|
||||
impl: Backend implementation instance
|
||||
layer: MockLayer instance
|
||||
builder_instance: Metadata builder instance
|
||||
backend_cfg: Backend configuration dict
|
||||
mla_dims: MLA dimension configuration
|
||||
device: Target device
|
||||
|
||||
Returns:
|
||||
BenchmarkResult with timing statistics
|
||||
"""
|
||||
# Parse batch spec
|
||||
requests = parse_batch_spec(config.batch_spec)
|
||||
q_lens = [r.q_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
|
||||
# Determine block size
|
||||
block_size = backend_cfg["block_size"] or config.block_size
|
||||
|
||||
# Build metadata
|
||||
metadata, num_blocks = _build_attention_metadata(
|
||||
requests, block_size, device, builder_instance
|
||||
)
|
||||
|
||||
# Create KV cache
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# Create input tensors for both decode and prefill modes
|
||||
decode_inputs, prefill_inputs = _create_input_tensors(
|
||||
total_q,
|
||||
mla_dims,
|
||||
backend_cfg["query_format"],
|
||||
device,
|
||||
torch.bfloat16,
|
||||
)
|
||||
|
||||
# Determine which forward method to use based on metadata
|
||||
if metadata.decode is not None:
|
||||
forward_fn = lambda: impl._forward_decode(
|
||||
decode_inputs, kv_cache, metadata, layer
|
||||
)
|
||||
elif metadata.prefill is not None:
|
||||
forward_fn = lambda: impl._forward_prefill(
|
||||
prefill_inputs["q"],
|
||||
prefill_inputs["k_c_normed"],
|
||||
prefill_inputs["k_pe"],
|
||||
kv_cache,
|
||||
metadata,
|
||||
prefill_inputs["k_scale"],
|
||||
prefill_inputs["output"],
|
||||
)
|
||||
else:
|
||||
raise RuntimeError("Metadata has neither decode nor prefill metadata")
|
||||
|
||||
# Warmup
|
||||
for _ in range(config.warmup_iters):
|
||||
forward_fn()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start.record()
|
||||
for _ in range(config.num_layers):
|
||||
forward_fn()
|
||||
end.record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
elapsed_ms = start.elapsed_time(end)
|
||||
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
||||
|
||||
mean_time = float(np.mean(times))
|
||||
return BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=mean_time,
|
||||
std_time=float(np.std(times)),
|
||||
min_time=float(np.min(times)),
|
||||
max_time=float(np.max(times)),
|
||||
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
|
||||
)
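# Timing convention used above, shown as a standalone sketch with made-up
# numbers: CUDA events report elapsed milliseconds across config.num_layers
# back-to-back forward calls, so each recorded sample is converted to seconds
# per single layer call before aggregation.
elapsed_ms_example = 12.0   # hypothetical: one repeat measured across 61 layers
num_layers_example = 61
seconds_per_layer = elapsed_ms_example / 1000.0 / num_layers_example  # ~1.97e-4 s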
|
||||
|
||||
|
||||
def _run_mla_benchmark_batched(
|
||||
backend: str,
|
||||
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Unified batched MLA benchmark runner for all backends.
|
||||
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
|
||||
|
||||
This function reuses backend initialization across multiple benchmarks
|
||||
to avoid setup/teardown overhead.
|
||||
|
||||
Args:
|
||||
backend: Backend name
|
||||
configs_with_params: List of (config, threshold, num_splits) tuples
|
||||
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
|
||||
- num_splits: num_kv_splits (CUTLASS only)
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
"""
|
||||
if not configs_with_params:
|
||||
return []
|
||||
|
||||
backend_cfg = _get_backend_config(backend)
|
||||
device = torch.device(configs_with_params[0][0].device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
# Determine block size
|
||||
config_block_size = configs_with_params[0][0].block_size
|
||||
block_size = backend_cfg["block_size"] or config_block_size
|
||||
|
||||
# Extract MLA dimensions from the first config
|
||||
first_config = configs_with_params[0][0]
|
||||
mla_dims = _extract_mla_dims_from_config(first_config)
|
||||
|
||||
# If config didn't provide MLA dims, fall back to default model
|
||||
if mla_dims is None:
|
||||
mla_dims = setup_mla_dims("deepseek-v3")
|
||||
|
||||
# Create and set vLLM config for MLA (reused across all benchmarks)
|
||||
vllm_config = create_minimal_vllm_config(
|
||||
model_name="deepseek-v3", # Used only for model path
|
||||
block_size=block_size,
|
||||
mla_dims=mla_dims, # Use custom dims from config or default
|
||||
)
|
||||
|
||||
results = []
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
# Create backend impl, layer, and builder (reused across benchmarks)
|
||||
impl, layer, builder_instance = _create_backend_impl(
|
||||
backend_cfg, mla_dims, vllm_config, device
|
||||
)
|
||||
|
||||
# Run each benchmark with the shared impl
|
||||
for config, threshold, num_splits in configs_with_params:
|
||||
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
|
||||
original_threshold = None
|
||||
if threshold is not None and builder_instance:
|
||||
original_threshold = builder_instance.reorder_batch_threshold
|
||||
builder_instance.reorder_batch_threshold = threshold
|
||||
|
||||
# Set num_splits for CUTLASS
|
||||
original_num_splits = None
|
||||
if num_splits is not None and hasattr(impl, "_num_kv_splits"):
|
||||
original_num_splits = impl._num_kv_splits
|
||||
impl._num_kv_splits = num_splits
|
||||
|
||||
try:
|
||||
result = _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
builder_instance,
|
||||
backend_cfg,
|
||||
mla_dims,
|
||||
device,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
finally:
|
||||
# Restore original threshold
|
||||
if original_threshold is not None:
|
||||
builder_instance.reorder_batch_threshold = original_threshold
|
||||
|
||||
# Restore original num_splits
|
||||
if original_num_splits is not None:
|
||||
impl._num_kv_splits = original_num_splits
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Public API
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def run_mla_benchmark(
|
||||
backend: str,
|
||||
config,
|
||||
reorder_batch_threshold: int | None = None,
|
||||
num_kv_splits: int | None = None,
|
||||
) -> BenchmarkResult | list[BenchmarkResult]:
|
||||
"""
|
||||
Unified MLA benchmark runner for all backends.
|
||||
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
|
||||
|
||||
Always uses batched execution internally so backend setup and teardown are shared across configs.
|
||||
|
||||
Args:
|
||||
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
|
||||
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
|
||||
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
|
||||
(single config mode only)
|
||||
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
|
||||
|
||||
Returns:
|
||||
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
|
||||
"""
|
||||
# Normalize to batched mode: (config, threshold, num_splits)
|
||||
if isinstance(config, list):
|
||||
# Already in batched format
|
||||
if len(config) > 0 and isinstance(config[0], tuple):
|
||||
# Format: [(cfg, param), ...] where param is threshold or num_splits
|
||||
if backend in ("flashattn_mla", "flashmla"):
|
||||
configs_with_params = [(cfg, param, None) for cfg, param in config]
|
||||
else: # cutlass_mla or flashinfer_mla
|
||||
configs_with_params = [(cfg, None, param) for cfg, param in config]
|
||||
else:
|
||||
# Format: [cfg, ...] - just configs
|
||||
configs_with_params = [(cfg, None, None) for cfg in config]
|
||||
return_single = False
|
||||
else:
|
||||
# Single config: convert to batched format
|
||||
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
|
||||
return_single = True
|
||||
|
||||
# Use unified batched execution
|
||||
results = _run_mla_benchmark_batched(backend, configs_with_params)
|
||||
|
||||
# Return single result or list based on input
|
||||
return results[0] if return_single else results
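# Hypothetical usage sketch. The BenchmarkConfig constructor arguments are
# assumed from the attributes read above (batch_spec, block_size, device,
# warmup_iters, repeats, num_layers, ...), not from a documented signature:
#
#   cfg = BenchmarkConfig(backend="flashmla", batch_spec="...", device="cuda:0", ...)
#   single = run_mla_benchmark("flashmla", cfg, reorder_batch_threshold=1)
#   batched = run_mla_benchmark("cutlass_mla", [(cfg, 2), (cfg, 4)])  # param = num_kv_splits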
|
||||
benchmarks/attention_benchmarks/runner.py (new file, 481 lines)
@@ -0,0 +1,481 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
|
||||
|
||||
This module provides helpers for running standard attention backends
|
||||
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
|
||||
"""
|
||||
|
||||
import types
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from batch_spec import parse_batch_spec, reorder_for_flashinfer
|
||||
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
|
||||
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
CompilationConfig,
|
||||
DeviceConfig,
|
||||
LoadConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
|
||||
# ============================================================================
|
||||
# Backend Configuration
|
||||
# ============================================================================
|
||||
|
||||
|
||||
_BACKEND_CONFIG = {
|
||||
"flash": {
|
||||
"module": "vllm.v1.attention.backends.flash_attn",
|
||||
"backend_class": "FlashAttentionBackend",
|
||||
"dtype": torch.float16,
|
||||
"cache_layout": "standard",
|
||||
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
|
||||
},
|
||||
"triton": {
|
||||
"module": "vllm.v1.attention.backends.triton_attn",
|
||||
"backend_class": "TritonAttentionBackend",
|
||||
"dtype": torch.float32,
|
||||
"cache_layout": "standard",
|
||||
},
|
||||
"flashinfer": {
|
||||
"module": "vllm.v1.attention.backends.flashinfer",
|
||||
"backend_class": "FlashInferBackend",
|
||||
"dtype": torch.float16,
|
||||
"cache_layout": "flashinfer",
|
||||
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _get_backend_config(backend: str) -> dict:
|
||||
if backend not in _BACKEND_CONFIG:
|
||||
raise ValueError(
|
||||
f"Unknown backend: {backend}. "
|
||||
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
|
||||
)
|
||||
return _BACKEND_CONFIG[backend]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Metadata Building Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _build_common_attn_metadata(
|
||||
q_lens: list[int],
|
||||
kv_lens: list[int],
|
||||
block_size: int,
|
||||
device: torch.device,
|
||||
) -> CommonAttentionMetadata:
|
||||
"""Build CommonAttentionMetadata from query/kv lengths."""
|
||||
batch_size = len(q_lens)
|
||||
total_tokens = sum(q_lens)
|
||||
|
||||
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
|
||||
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
|
||||
0
|
||||
)
|
||||
query_start_loc_cpu = query_start_loc.cpu()
|
||||
|
||||
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
|
||||
seq_lens_cpu = seq_lens.cpu()
|
||||
max_seq_len = int(seq_lens_cpu.max())
|
||||
|
||||
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
|
||||
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
|
||||
|
||||
max_blocks = (max(kv_lens) + block_size - 1) // block_size
|
||||
num_blocks = batch_size * max_blocks
|
||||
block_table_tensor = torch.arange(
|
||||
num_blocks, dtype=torch.int32, device=device
|
||||
).view(batch_size, max_blocks)
|
||||
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
|
||||
|
||||
max_query_len = max(q_lens)
|
||||
|
||||
return CommonAttentionMetadata(
|
||||
query_start_loc=query_start_loc,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
num_reqs=batch_size,
|
||||
num_actual_tokens=total_tokens,
|
||||
max_query_len=max_query_len,
|
||||
max_seq_len=max_seq_len,
|
||||
block_table_tensor=block_table_tensor,
|
||||
slot_mapping=slot_mapping,
|
||||
causal=True,
|
||||
)
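# Worked example of the bookkeeping above (standalone sketch with illustrative
# values): for q_lens = [4, 1, 1] and kv_lens = [32, 100, 7], the prefix sums
# give query_start_loc = [0, 4, 5, 6], the per-request context lengths are
# kv - q = [28, 99, 6], and max_seq_len = 100.
import torch

_q_example = torch.tensor([4, 1, 1], dtype=torch.int32)
_kv_example = torch.tensor([32, 100, 7], dtype=torch.int32)
_qsl_example = torch.zeros(4, dtype=torch.int32)
_qsl_example[1:] = _q_example.cumsum(0)
assert _qsl_example.tolist() == [0, 4, 5, 6]
assert (_kv_example - _q_example).tolist() == [28, 99, 6]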
|
||||
|
||||
|
||||
def _create_vllm_config(
|
||||
config: BenchmarkConfig,
|
||||
dtype: torch.dtype,
|
||||
max_num_blocks: int,
|
||||
) -> VllmConfig:
|
||||
"""Create a VllmConfig for benchmarking with mock model methods."""
|
||||
model_config = ModelConfig(
|
||||
model="meta-llama/Meta-Llama-3-8B",
|
||||
tokenizer="meta-llama/Meta-Llama-3-8B",
|
||||
trust_remote_code=False,
|
||||
dtype=dtype,
|
||||
seed=0,
|
||||
max_model_len=1024,
|
||||
)
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=config.block_size,
|
||||
cache_dtype="auto",
|
||||
swap_space=0,
|
||||
)
|
||||
cache_config.num_gpu_blocks = max_num_blocks
|
||||
cache_config.num_cpu_blocks = 0
|
||||
|
||||
parallel_config = ParallelConfig(tensor_parallel_size=1)
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=256,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=8192,
|
||||
is_encoder_decoder=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
device_config = DeviceConfig()
|
||||
load_config = LoadConfig()
|
||||
compilation_config = CompilationConfig()
|
||||
|
||||
# Add mock methods for benchmark config values
|
||||
model_config.get_num_layers = types.MethodType(
|
||||
lambda self: config.num_layers, model_config
|
||||
)
|
||||
model_config.get_sliding_window_for_layer = types.MethodType(
|
||||
lambda self, i: None, model_config
|
||||
)
|
||||
model_config.get_logits_soft_cap_for_layer = types.MethodType(
|
||||
lambda self, i: 0.0, model_config
|
||||
)
|
||||
model_config.get_sm_scale_for_layer = types.MethodType(
|
||||
lambda self, i: 1.0 / config.head_dim**0.5, model_config
|
||||
)
|
||||
model_config.get_num_attention_heads = types.MethodType(
|
||||
lambda self, parallel_config=None: config.num_q_heads, model_config
|
||||
)
|
||||
model_config.get_num_kv_heads = types.MethodType(
|
||||
lambda self, parallel_config=None: config.num_kv_heads, model_config
|
||||
)
|
||||
model_config.get_head_size = types.MethodType(
|
||||
lambda self: config.head_dim, model_config
|
||||
)
|
||||
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
|
||||
|
||||
return VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
scheduler_config=scheduler_config,
|
||||
device_config=device_config,
|
||||
load_config=load_config,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Backend Initialization
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _create_backend_impl(
|
||||
backend_cfg: dict,
|
||||
config: BenchmarkConfig,
|
||||
device: torch.device,
|
||||
):
|
||||
"""Create backend implementation instance."""
|
||||
import importlib
|
||||
|
||||
backend_module = importlib.import_module(backend_cfg["module"])
|
||||
backend_class = getattr(backend_module, backend_cfg["backend_class"])
|
||||
|
||||
scale = get_attention_scale(config.head_dim)
|
||||
dtype = backend_cfg["dtype"]
|
||||
|
||||
impl = backend_class.get_impl_cls()(
|
||||
num_heads=config.num_q_heads,
|
||||
head_size=config.head_dim,
|
||||
scale=scale,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype="auto",
|
||||
)
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=config.block_size,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
head_size=config.head_dim,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
|
||||
|
||||
return backend_class, impl, layer, dtype
|
||||
|
||||
|
||||
def _create_metadata_builder(
|
||||
backend_class,
|
||||
kv_cache_spec: FullAttentionSpec,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
):
|
||||
"""Create metadata builder instance."""
|
||||
return backend_class.get_builder_cls()(
|
||||
kv_cache_spec=kv_cache_spec,
|
||||
layer_names=["layer_0"],
|
||||
vllm_config=vllm_config,
|
||||
device=device,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Tensor Creation Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _create_input_tensors(
|
||||
config: BenchmarkConfig,
|
||||
total_q: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
) -> tuple:
|
||||
"""Create Q, K, V input tensors for all layers."""
|
||||
q_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
k_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
v_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
return q_list, k_list, v_list
|
||||
|
||||
|
||||
def _create_kv_cache(
|
||||
config: BenchmarkConfig,
|
||||
max_num_blocks: int,
|
||||
cache_layout: str,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
) -> list:
|
||||
"""Create KV cache tensors for all layers."""
|
||||
if cache_layout == "flashinfer":
|
||||
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
|
||||
cache_list = [
|
||||
torch.zeros(
|
||||
max_num_blocks,
|
||||
2,
|
||||
config.block_size,
|
||||
config.num_kv_heads,
|
||||
config.head_dim,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
else:
|
||||
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
|
||||
cache_list = [
|
||||
torch.zeros(
|
||||
2,
|
||||
max_num_blocks,
|
||||
config.block_size,
|
||||
config.num_kv_heads,
|
||||
config.head_dim,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
return cache_list
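# Shape check for the two layouts above (standalone sketch, sizes chosen only
# for illustration): with 8 blocks, block_size 16, 8 KV heads and head_dim 128,
# the FlashInfer layout packs K and V per block while the standard layout keeps
# separate K and V planes; both hold the same number of elements.
import torch

_fi_cache = torch.zeros(8, 2, 16, 8, 128)   # [num_blocks, 2, block_size, num_kv_heads, head_dim]
_std_cache = torch.zeros(2, 8, 16, 8, 128)  # [2, num_blocks, block_size, num_kv_heads, head_dim]
assert _fi_cache.numel() == _std_cache.numel() == 2 * 8 * 16 * 8 * 128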
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Benchmark Execution
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _run_single_benchmark(
|
||||
config: BenchmarkConfig,
|
||||
impl,
|
||||
layer,
|
||||
q_list: list,
|
||||
k_list: list,
|
||||
v_list: list,
|
||||
cache_list: list,
|
||||
attn_metadata,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
) -> tuple:
|
||||
"""Run single benchmark iteration with warmup and timing loop."""
|
||||
total_q = q_list[0].shape[0]
|
||||
out = torch.empty(
|
||||
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
|
||||
# Warmup
|
||||
for _ in range(config.warmup_iters):
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start.record()
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
end.record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
elapsed_ms = start.elapsed_time(end)
|
||||
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
||||
|
||||
mem_stats = {}
|
||||
if config.profile_memory:
|
||||
mem_stats = {
|
||||
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
|
||||
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
|
||||
}
|
||||
|
||||
return times, mem_stats
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Public API
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
"""
|
||||
Run standard attention benchmark with real kernels.
|
||||
|
||||
Supports: flash, triton, flashinfer
|
||||
|
||||
Args:
|
||||
config: Benchmark configuration
|
||||
|
||||
Returns:
|
||||
BenchmarkResult with timing and memory statistics
|
||||
"""
|
||||
device = torch.device(config.device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
backend_cfg = _get_backend_config(config.backend)
|
||||
|
||||
requests = parse_batch_spec(config.batch_spec)
|
||||
|
||||
if config.backend == "flashinfer":
|
||||
requests = reorder_for_flashinfer(requests)
|
||||
|
||||
q_lens = [r.q_len for r in requests]
|
||||
kv_lens = [r.kv_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
max_kv = max(kv_lens)
|
||||
|
||||
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
|
||||
|
||||
backend_class, impl, layer, dtype = _create_backend_impl(
|
||||
backend_cfg, config, device
|
||||
)
|
||||
|
||||
common_metadata = _build_common_attn_metadata(
|
||||
q_lens, kv_lens, config.block_size, device
|
||||
)
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=config.block_size,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
head_size=config.head_dim,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
|
||||
|
||||
builder = _create_metadata_builder(
|
||||
backend_class, kv_cache_spec, vllm_config, device
|
||||
)
|
||||
|
||||
attn_metadata = builder.build(
|
||||
common_prefix_len=0,
|
||||
common_attn_metadata=common_metadata,
|
||||
)
|
||||
|
||||
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
|
||||
|
||||
cache_list = _create_kv_cache(
|
||||
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
|
||||
)
|
||||
|
||||
times, mem_stats = _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
q_list,
|
||||
k_list,
|
||||
v_list,
|
||||
cache_list,
|
||||
attn_metadata,
|
||||
device,
|
||||
dtype,
|
||||
)
|
||||
|
||||
mean_time = np.mean(times)
|
||||
throughput = total_q / mean_time if mean_time > 0 else 0
|
||||
|
||||
return BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=mean_time,
|
||||
std_time=np.std(times),
|
||||
min_time=np.min(times),
|
||||
max_time=np.max(times),
|
||||
throughput_tokens_per_sec=throughput,
|
||||
memory_allocated_mb=mem_stats.get("allocated_mb"),
|
||||
memory_reserved_mb=mem_stats.get("reserved_mb"),
|
||||
)
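# Hypothetical usage sketch. BenchmarkConfig field names are inferred from the
# attribute accesses above; the batch_spec string format is defined by
# batch_spec.parse_batch_spec and is not reproduced here:
#
#   cfg = BenchmarkConfig(backend="flash", batch_spec="<spec>", block_size=16,
#                         num_q_heads=32, num_kv_heads=8, head_dim=128,
#                         num_layers=32, warmup_iters=3, repeats=10,
#                         device="cuda:0", profile_memory=True)
#   result = run_attention_benchmark(cfg)
#   print(result.mean_time, result.throughput_tokens_per_sec)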
|
||||
@@ -20,8 +20,12 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
||||
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"vllm": dict(backend="vllm", enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", enabled=True),
|
||||
"vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
|
||||
"vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
|
||||
"flashinfer-swizzle": dict(
|
||||
backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
|
||||
),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
@@ -36,7 +40,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
@@ -63,19 +67,36 @@ def benchmark(batch_size, provider, N, K):
|
||||
|
||||
if cfg["backend"] == "vllm":
|
||||
# vLLM's FP4 quantization
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(a, a_global_scale),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
elif cfg["backend"] == "flashinfer":
|
||||
# FlashInfer's FP4 quantization
|
||||
# Use is_sf_swizzled_layout=True to match vLLM's output format
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
# Convert ms to us for better readability at small batch sizes
|
||||
to_us = lambda t_ms: t_ms * 1000
|
||||
@@ -92,7 +113,9 @@ def prepare_shapes(args):
|
||||
return out
|
||||
|
||||
|
||||
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
def _test_accuracy_once(
|
||||
M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
|
||||
):
|
||||
"""Test accuracy between vLLM and FlashInfer FP4 quantization."""
|
||||
# Create input tensor
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
@@ -101,11 +124,13 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
a_global_scale = compute_global_scale(a)
|
||||
|
||||
# vLLM quantization
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
|
||||
# FlashInfer quantization (with swizzled layout to match vLLM's output)
|
||||
flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
|
||||
|
||||
@@ -114,7 +139,14 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
vllm_fp4,
|
||||
flashinfer_fp4,
|
||||
)
|
||||
print(f"M={M}, K={K}, dtype={dtype}: PASSED")
|
||||
# Compare scales
|
||||
torch.testing.assert_close(
|
||||
vllm_scale,
|
||||
flashinfer_scale,
|
||||
)
|
||||
print(
|
||||
f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED" # noqa: E501
|
||||
)
|
||||
|
||||
|
||||
def test_accuracy():
|
||||
@@ -130,9 +162,10 @@ def test_accuracy():
|
||||
Ms = [1, 1024]
|
||||
Ks = [4096]
|
||||
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device)
|
||||
for is_sf_swizzled_layout in [True, False]:
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)
|
||||
|
||||
print("\nAll accuracy tests passed!")
|
||||
|
||||
@@ -145,7 +178,7 @@ if __name__ == "__main__":
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
|
||||
@@ -7,7 +7,7 @@ import itertools
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.activation # noqa F401
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.custom_op import op_registry
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
||||
@@ -33,14 +33,14 @@ def benchmark_activation(
|
||||
torch.set_default_device(device)
|
||||
|
||||
if func_name == "gelu_and_mul":
|
||||
layer = CustomOp.op_registry[func_name](approximate="none")
|
||||
layer = op_registry[func_name](approximate="none")
|
||||
elif func_name == "gelu_and_mul_tanh":
|
||||
layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
|
||||
layer = op_registry["gelu_and_mul"](approximate="tanh")
|
||||
elif func_name == "fatrelu_and_mul":
|
||||
threshold = 0.5
|
||||
layer = CustomOp.op_registry[func_name](threshold)
|
||||
layer = op_registry[func_name](threshold)
|
||||
else:
|
||||
layer = CustomOp.op_registry[func_name]()
|
||||
layer = op_registry[func_name]()
|
||||
|
||||
x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
|
||||
compiled_layer = torch.compile(layer.forward_native)
|
||||
|
||||
@@ -1,244 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from packaging import version
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
|
||||
MINIMUM_BITBLAS_VERSION,
|
||||
)
|
||||
|
||||
try:
|
||||
import bitblas
|
||||
|
||||
if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
|
||||
raise ImportError(
|
||||
"bitblas version is wrong. Please "
|
||||
f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
|
||||
)
|
||||
except ImportError as e:
|
||||
bitblas_import_exception = e
|
||||
raise ValueError(
|
||||
"Trying to use the bitblas backend, but could not import"
|
||||
f"with the following error: {bitblas_import_exception}. "
|
||||
"Please install bitblas through the following command: "
|
||||
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
|
||||
) from bitblas_import_exception
|
||||
|
||||
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
|
||||
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark BitBLAS int4 on a specific target."
|
||||
)
|
||||
|
||||
# Add arguments to the parser
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
type=str,
|
||||
default=auto_detect_nvidia_target(),
|
||||
help="Specify the target device for benchmarking.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--group_size", type=int, default=None, help="Group size for grouped quantization."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--A_dtype",
|
||||
type=str,
|
||||
default="float16",
|
||||
choices=["float16", "float32", "float64", "int32", "int8"],
|
||||
help="Data type of activation A.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--W_dtype",
|
||||
type=str,
|
||||
default="int4",
|
||||
choices=[
|
||||
"float16",
|
||||
"float32",
|
||||
"float64",
|
||||
"int32",
|
||||
"int8",
|
||||
"int4",
|
||||
"int2",
|
||||
"int1",
|
||||
"nf4",
|
||||
"fp4_e2m1",
|
||||
],
|
||||
help="Data type of weight W.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--accum_dtype",
|
||||
type=str,
|
||||
default="float16",
|
||||
choices=["float16", "int32"],
|
||||
help="Data type for accumulation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out_dtype",
|
||||
type=str,
|
||||
default="float16",
|
||||
choices=["float16", "float32", "int32", "int8"],
|
||||
help="Data type for output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--layout",
|
||||
type=str,
|
||||
default="nt",
|
||||
choices=["nt", "nn"],
|
||||
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_bias", action="store_true", help="Include bias in the benchmark."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_scaling",
|
||||
action="store_true",
|
||||
help="Include scaling factor in the quantization.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_zeros", action="store_true", help="Include zeros in the quantization."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--zeros_mode",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["original", "rescale", "quantized"],
|
||||
help="Specify the mode for calculating zeros.",
|
||||
)
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
||||
# Assign arguments to variables
|
||||
target = args.target
|
||||
A_dtype = args.A_dtype
|
||||
W_dtype = args.W_dtype
|
||||
accum_dtype = args.accum_dtype
|
||||
out_dtype = args.out_dtype
|
||||
layout = args.layout
|
||||
with_bias = args.with_bias
|
||||
group_size = args.group_size
|
||||
with_scaling = args.with_scaling
|
||||
with_zeros = args.with_zeros
|
||||
zeros_mode = args.zeros_mode
|
||||
|
||||
# Define a list of shared arguments that repeat in every config
|
||||
shared_args = [
|
||||
A_dtype,
|
||||
W_dtype,
|
||||
out_dtype,
|
||||
accum_dtype,
|
||||
layout,
|
||||
with_bias,
|
||||
group_size,
|
||||
with_scaling,
|
||||
with_zeros,
|
||||
zeros_mode,
|
||||
]
|
||||
|
||||
# Define just the (M, K, N) shapes in a more compact list
|
||||
shapes = [
|
||||
# square test
|
||||
(1, 16384, 16384),
|
||||
# BLOOM-176B
|
||||
(1, 43008, 14336),
|
||||
(1, 14336, 14336),
|
||||
(1, 57344, 14336),
|
||||
(1, 14336, 57344),
|
||||
# OPT-65B
|
||||
(1, 9216, 9216),
|
||||
(1, 36864, 9216),
|
||||
(1, 9216, 36864),
|
||||
(1, 22016, 8192),
|
||||
# LLAMA-70B/65B
|
||||
(1, 8192, 22016),
|
||||
(1, 8192, 8192),
|
||||
(1, 28672, 8192),
|
||||
(1, 8192, 28672),
|
||||
# square test
|
||||
(16384, 16384, 16384),
|
||||
# BLOOM-176B
|
||||
(8192, 43008, 14336),
|
||||
(8192, 14336, 14336),
|
||||
(8192, 57344, 14336),
|
||||
(8192, 14336, 57344),
|
||||
# OPT-65B
|
||||
(8192, 9216, 9216),
|
||||
(8192, 36864, 9216),
|
||||
(8192, 9216, 36864),
|
||||
(8192, 22016, 8192),
|
||||
# LLAMA-70B/65B
|
||||
(8192, 8192, 22016),
|
||||
(8192, 8192, 8192),
|
||||
(8192, 28672, 8192),
|
||||
(8192, 8192, 28672),
|
||||
]
|
||||
|
||||
# Build test shapes with all the shared arguments
|
||||
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]
|
||||
|
||||
benchmark_sets = []
|
||||
benchmark_sets.extend(test_shapes)
|
||||
|
||||
benchmark_results = {}
|
||||
for config_class, operator, input_args in benchmark_sets:
|
||||
config = config_class(*input_args)
|
||||
matmul = operator(config, target=target, enable_tuning=True)
|
||||
kernel_latency = matmul.profile_latency()
|
||||
|
||||
print("Time cost is: {:.3f} ms".format(kernel_latency))
|
||||
|
||||
profile_config = {
|
||||
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
|
||||
"BitBLAS_top20_latency": kernel_latency,
|
||||
}
|
||||
}
|
||||
|
||||
benchmark_results.update(profile_config)
|
||||
|
||||
# Define headers for the table
|
||||
headers = [
|
||||
"PrimFunc",
|
||||
"Input Arguments",
|
||||
"BitBLAS Top20 Latency",
|
||||
]
|
||||
|
||||
# Calculate column widths for pretty printing
|
||||
col_widths = [0, 0, 0]
|
||||
for config_key, values in benchmark_results.items():
|
||||
args_split = config_key.split("-")
|
||||
func_name = args_split[0]
|
||||
input_args_str = "-".join(args_split[1:])
|
||||
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
|
||||
col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
|
||||
col_widths[2] = max(
|
||||
col_widths[2],
|
||||
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
|
||||
len(headers[2]) + 2,
|
||||
)
|
||||
# break only if you want to measure widths from a single example;
|
||||
# otherwise, let it loop over all items.
|
||||
|
||||
# Print header
|
||||
for i, header in enumerate(headers):
|
||||
headers[i] = header.ljust(col_widths[i])
|
||||
print("".join(headers))
|
||||
print("-" * sum(col_widths))
|
||||
|
||||
# Print rows
|
||||
for config_key, values in benchmark_results.items():
|
||||
args_split = config_key.split("-")
|
||||
func_name = args_split[0]
|
||||
input_args_str = "-".join(args_split[1:])
|
||||
row = [
|
||||
func_name,
|
||||
input_args_str,
|
||||
f"{values['BitBLAS_top20_latency']:.3f} ms",
|
||||
]
|
||||
row_str = "".join(
|
||||
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
|
||||
)
|
||||
print(row_str)
|
||||
@@ -9,6 +9,7 @@ but use different quantization strategies and backends.
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
||||
@@ -138,12 +139,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
e=num_experts,
|
||||
n=n,
|
||||
k=k,
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=num_experts,
|
||||
hidden_dim=k,
|
||||
intermediate_size_per_partition=n,
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
@@ -196,10 +197,9 @@ def bench_run(
|
||||
)
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -242,10 +242,9 @@ def bench_run(
|
||||
)
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
benchmarks/kernels/benchmark_fused_topk.py (new file, 99 lines)
@@ -0,0 +1,99 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.router.fused_topk_router import fused_topk
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
num_tokens_range = [2**i for i in range(0, 8, 2)]
|
||||
num_experts_range = [16, 32, 64, 128, 256, 512]
|
||||
topk_range = [3, 4]
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
|
||||
|
||||
def torch_topk(
|
||||
gating_output: torch.Tensor,
|
||||
topk: int,
|
||||
renormalize: bool,
|
||||
scoring_func: str = "softmax",
|
||||
):
|
||||
if scoring_func == "softmax":
|
||||
scores = torch.softmax(gating_output.float(), dim=-1)
|
||||
else:
|
||||
scores = torch.sigmoid(gating_output.float())
|
||||
topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
|
||||
|
||||
if renormalize:
|
||||
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
|
||||
|
||||
return topk_weights, topk_ids
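# Worked example of the renormalize step above (values chosen only for
# illustration): with scores [0.5, 0.3, 0.2] and topk=2, the selected weights
# [0.5, 0.3] are rescaled to sum to 1, giving [0.625, 0.375].
import torch

_w_example = torch.tensor([[0.5, 0.3]])
_w_example = _w_example / _w_example.sum(dim=-1, keepdim=True)
assert torch.allclose(_w_example, torch.tensor([[0.625, 0.375]]))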
|
||||
|
||||
|
||||
def get_benchmark(scoring_func):
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_vals=[list(_) for _ in configs],
|
||||
line_arg="provider",
|
||||
line_vals=["torch", "vllm"],
|
||||
line_names=["Torch", "vLLM"],
|
||||
styles=[("blue", "-"), ("red", "-")],
|
||||
ylabel="us",
|
||||
plot_name=f"fused-topk-perf-{scoring_func}",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
dtype = torch.bfloat16
|
||||
hidden_size = 1024
|
||||
renormalize = True
|
||||
hidden_states = torch.randn(
|
||||
(num_tokens, hidden_size), dtype=dtype, device="cuda"
|
||||
)
|
||||
gating_output = torch.randn(
|
||||
(num_tokens, num_experts), dtype=dtype, device="cuda"
|
||||
)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: torch_topk(
|
||||
gating_output=gating_output,
|
||||
topk=topk,
|
||||
renormalize=renormalize,
|
||||
scoring_func=scoring_func,
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: fused_topk(
|
||||
hidden_states=hidden_states,
|
||||
gating_output=gating_output,
|
||||
topk=topk,
|
||||
renormalize=renormalize,
|
||||
scoring_func=scoring_func,
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
return benchmark
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the MoE topk kernel.")
|
||||
parser.add_argument("--scoring-func", type=str, default="softmax")
|
||||
parser.add_argument("--save-path", type=str, default="./configs/fused_topk/")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get the benchmark function
|
||||
benchmark = get_benchmark(args.scoring_func)
|
||||
# Run performance benchmark
|
||||
benchmark.run(print_data=True, save_path=args.save_path)
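# Example invocations (sketch): only the two arguments registered above are
# accepted, so softmax and sigmoid routing can be compared with, e.g.:
#   python benchmarks/kernels/benchmark_fused_topk.py
#   python benchmarks/kernels/benchmark_fused_topk.py --scoring-func sigmoid --save-path ./configs/fused_topk/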
|
||||
@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
@@ -134,13 +135,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -166,13 +167,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -842,6 +842,7 @@ class BenchmarkTensors:
|
||||
"sorted_token_ids": sorted_token_ids,
|
||||
"expert_ids": expert_ids,
|
||||
"num_tokens_post_padded": num_tokens_post_padded,
|
||||
"token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
|
||||
"top_k_num": ctx.top_k_num,
|
||||
"device": self.input.device,
|
||||
"N": lora_rank,
|
||||
@@ -915,6 +916,7 @@ class BenchmarkTensors:
|
||||
"sorted_token_ids": sorted_token_ids,
|
||||
"expert_ids": expert_ids,
|
||||
"num_tokens_post_padded": num_tokens_post_padded,
|
||||
"token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
|
||||
"top_k_num": ctx.top_k_num,
|
||||
"device": self.input.device,
|
||||
"N": lora_rank,
|
||||
|
||||
@@ -231,7 +231,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
|
||||
assert bt.w_tok_s is None
|
||||
assert bt.group_size is not None
|
||||
|
||||
fn = lambda: ops.gptq_marlin_gemm(
|
||||
fn = lambda: ops.marlin_gemm(
|
||||
a=bt.a,
|
||||
c=None,
|
||||
b_q_weight=w_q,
|
||||
|
||||
@@ -6,12 +6,6 @@ import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL,
|
||||
GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
|
||||
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.allspark_utils import (
|
||||
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
|
||||
ALLSPARK_SUPPORTED_QUANT_TYPES,
|
||||
@@ -34,9 +28,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
awq_marlin_quantize,
|
||||
marlin_quantize,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
|
||||
marlin_24_quantize,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
gptq_pack,
|
||||
gptq_quantize_weights,
|
||||
@@ -78,14 +69,7 @@ def bench_run(
|
||||
if size_k % group_size != 0:
|
||||
return
|
||||
|
||||
marlin_24_supported = (
|
||||
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
|
||||
)
|
||||
repack_supported = (
|
||||
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
|
||||
)
|
||||
repack_supported = group_size in MARLIN_SUPPORTED_GROUP_SIZES
|
||||
allspark_supported = (
|
||||
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
|
||||
and group_size == -1
|
||||
@@ -126,14 +110,6 @@ def bench_run(
|
||||
marlin_sort_indices,
|
||||
)
|
||||
|
||||
def gen_marlin_24_params():
|
||||
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
|
||||
if marlin_24_supported:
|
||||
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
|
||||
marlin_24_quantize(b, quant_type, group_size)
|
||||
)
|
||||
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
|
||||
|
||||
def gen_repack_params():
|
||||
q_w_gptq = None
|
||||
repack_sort_indices = None
|
||||
@@ -188,9 +164,6 @@ def bench_run(
|
||||
marlin_g_idx,
|
||||
marlin_sort_indices,
|
||||
) = gen_marlin_params()
|
||||
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
|
||||
gen_marlin_24_params()
|
||||
)
|
||||
q_w_gptq, repack_sort_indices = gen_repack_params()
|
||||
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
|
||||
gen_allspark_params()
|
||||
@@ -200,9 +173,6 @@ def bench_run(
|
||||
marlin_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
|
||||
)
|
||||
marlin_24_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
|
||||
)
|
||||
|
||||
globals = {
|
||||
# Gen params
|
||||
@@ -222,12 +192,6 @@ def bench_run(
|
||||
"marlin_sort_indices": marlin_sort_indices,
|
||||
"marlin_workspace": marlin_workspace,
|
||||
"is_k_full": is_k_full,
|
||||
# Marlin_24 params
|
||||
"marlin_24_w_ref": marlin_24_w_ref,
|
||||
"marlin_24_q_w_comp": marlin_24_q_w_comp,
|
||||
"marlin_24_meta": marlin_24_meta,
|
||||
"marlin_24_s": marlin_24_s,
|
||||
"marlin_24_workspace": marlin_24_workspace,
|
||||
# GPTQ params
|
||||
"q_w_gptq": q_w_gptq,
|
||||
"repack_sort_indices": repack_sort_indices,
|
||||
@@ -239,8 +203,7 @@ def bench_run(
|
||||
"sm_version": sm_version,
|
||||
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
|
||||
# Kernels
|
||||
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||
"marlin_gemm": ops.marlin_gemm,
|
||||
"gptq_marlin_repack": ops.gptq_marlin_repack,
|
||||
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
|
||||
}
|
||||
@@ -263,35 +226,24 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm",
|
||||
description="marlin_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm_fp32",
|
||||
description="marlin_gemm_fp32",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
if marlin_24_supported:
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_24_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
if repack_supported:
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
|
||||
@@ -15,12 +15,18 @@ import ray
|
||||
import torch
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
FusedMoEQuantConfig,
|
||||
RoutingMethodType,
|
||||
_get_config_dtype_str,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
||||
TritonOrDeepGemmExperts,
|
||||
)
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
@@ -194,10 +200,36 @@ def benchmark_config(
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
|
||||
deep_gemm_experts = None
|
||||
if use_deep_gemm:
|
||||
deep_gemm_experts = mk.FusedMoEModularKernel(
|
||||
prepare_finalize=MoEPrepareAndFinalizeNoEP(),
|
||||
fused_experts=TritonOrDeepGemmExperts(
|
||||
moe_config=FusedMoEConfig(
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
hidden_dim=hidden_size,
|
||||
intermediate_size_per_partition=shard_intermediate_size,
|
||||
num_local_experts=num_experts,
|
||||
activation="silu",
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=init_dtype,
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
device="cuda",
|
||||
),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
with override_config(config):
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
x, input_gating, topk, renormalize=not use_deep_gemm
|
||||
)
|
||||
|
||||
if use_deep_gemm:
|
||||
return deep_gemm_experts(
|
||||
x, w1, w2, topk_weights, topk_ids, inplace=True
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
w1,
|
||||
@@ -206,7 +238,6 @@ def benchmark_config(
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
quant_config=quant_config,
|
||||
allow_deep_gemm=use_deep_gemm,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
@@ -450,6 +481,8 @@ class BenchmarkWorker:
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
# local import to allow serialization by ray
|
||||
|
||||
set_random_seed(self.seed)
|
||||
dtype_str = _get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
@@ -503,6 +536,9 @@ class BenchmarkWorker:
|
||||
block_quant_shape: list[int],
|
||||
use_deep_gemm: bool,
|
||||
) -> dict[str, int]:
|
||||
# local import to allow serialization by ray
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
best_config = None
|
||||
best_time = float("inf")
|
||||
if current_platform.is_rocm():
|
||||
@@ -614,20 +650,28 @@ def save_configs(
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def get_compressed_tensors_block_structure(config, default_value=None):
|
||||
config_groups = config.get("config_groups", {})
|
||||
if len(config_groups) != 1:
|
||||
return default_value
|
||||
group = next(iter(config_groups.values()))
|
||||
weights = group.get("weights", {})
|
||||
block_structure = weights.get("block_structure", default_value)
|
||||
return block_structure
|
||||
|
||||
|
||||
def get_weight_block_size_safety(config, default_value=None):
|
||||
quantization_config = getattr(config, "quantization_config", {})
|
||||
if isinstance(quantization_config, dict):
|
||||
return quantization_config.get("weight_block_size", default_value)
|
||||
if "weight_block_size" in quantization_config:
|
||||
return quantization_config["weight_block_size"]
|
||||
return get_compressed_tensors_block_structure(
|
||||
quantization_config, default_value
|
||||
)
|
||||
return default_value
|
||||
|
||||
|
||||
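Note: a hedged usage example for the helper above; the dict mirrors the compressed-tensors config layout it reads, and the values are invented.

example_quant_config = {
    "config_groups": {
        "group_0": {"weights": {"block_structure": [128, 128]}},
    }
}
assert get_compressed_tensors_block_structure(example_quant_config) == [128, 128]
# Zero or multiple config groups fall back to the default value.
assert get_compressed_tensors_block_structure({"config_groups": {}}, [1, 1]) == [1, 1]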
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
||||
if args.model_prefix:
|
||||
config = getattr(config, args.model_prefix)
|
||||
|
||||
def get_model_params(config):
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
@@ -643,7 +687,9 @@ def main(args: argparse.Namespace):
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
"Glm4MoeLiteForCausalLM",
|
||||
"NemotronHForCausalLM",
|
||||
"MistralLarge3ForCausalLM",
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
@@ -664,16 +710,20 @@ def main(args: argparse.Namespace):
|
||||
topk = text_config.num_experts_per_tok
|
||||
intermediate_size = text_config.moe_intermediate_size
|
||||
hidden_size = text_config.hidden_size
|
||||
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
|
||||
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.moe_topk[0]
|
||||
intermediate_size = config.moe_intermediate_size[0]
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
|
||||
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
|
||||
E = config.thinker_config.text_config.num_experts
|
||||
topk = config.thinker_config.text_config.num_experts_per_tok
|
||||
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
|
||||
hidden_size = config.thinker_config.text_config.hidden_size
|
||||
elif config.architectures[0] == "PixtralForConditionalGeneration":
|
||||
# Pixtral can contain different LLM architectures,
|
||||
# recurse to get their parameters
|
||||
return get_model_params(config.get_text_config())
|
||||
else:
|
||||
# Support for llama4
|
||||
config = config.get_text_config()
|
||||
@@ -682,6 +732,16 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
return E, topk, intermediate_size, hidden_size
|
||||
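Note on the two elif fixes above: the HunYuan check used `in ("HunYuanMoEV1ForCausalLM")`, and a one-element parenthesised string is not a tuple, so `in` silently becomes a substring test; the Qwen3Omni one-element list worked but reads more clearly as `==`. Illustration:

arch = "HunYuanMoEV1ForCausalLM"
assert ("HunYuanMoEV1ForCausalLM") == arch            # parentheses alone do not make a tuple
assert "MoEV1" in ("HunYuanMoEV1ForCausalLM")         # substring match; the subtle bug
assert "MoEV1" not in ("HunYuanMoEV1ForCausalLM",)    # a real one-element tuple
assert arch == "HunYuanMoEV1ForCausalLM"              # the unambiguous replacement check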
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
||||
if args.model_prefix:
|
||||
config = getattr(config, args.model_prefix)
|
||||
E, topk, intermediate_size, hidden_size = get_model_params(config)
|
||||
enable_ep = bool(args.enable_expert_parallel)
|
||||
if enable_ep:
|
||||
ensure_divisibility(E, args.tp_size, "Number of experts")
|
||||
|
||||
@@ -8,10 +8,8 @@ import ray
|
||||
import torch
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||
_moe_permute,
|
||||
_moe_unpermute_and_reduce,
|
||||
moe_permute,
|
||||
moe_unpermute,
|
||||
)
|
||||
@@ -41,16 +39,13 @@ def benchmark_permute(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
use_customized_permute: bool = False,
|
||||
) -> float:
|
||||
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
# output_hidden_states = torch.empty_like(hidden_states)
|
||||
if use_fp8_w8a8:
|
||||
align_block_size = 128 # deepgemm needs 128 m aligned block
|
||||
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
|
||||
else:
|
||||
align_block_size = None
|
||||
qhidden_states = hidden_states
|
||||
|
||||
gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
|
||||
@@ -64,31 +59,13 @@ def benchmark_permute(
|
||||
input_gating.copy_(gating_output[i])
|
||||
|
||||
def run():
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
)
|
||||
moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
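Note: the permute step benchmarked above reorders token rows so that all rows routed to the same expert sit contiguously, keeping an inverse permutation to restore order after the expert GEMMs. A pure-Python picture of that idea (not the kernel, and without the quantization or alignment details):

import torch

topk_ids = torch.tensor([[2, 0], [1, 2], [0, 1]])   # 3 tokens, top-2 experts each
flat = topk_ids.flatten()                            # one routing decision per row copy
perm = torch.argsort(flat, stable=True)              # group row copies by expert id
inv_perm = torch.argsort(perm)                       # later undoes the grouping
assert torch.equal(flat[perm][inv_perm], flat)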
@@ -133,16 +110,12 @@ def benchmark_unpermute(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
use_customized_permute: bool = False,
|
||||
) -> float:
|
||||
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
output_hidden_states = torch.empty_like(hidden_states)
|
||||
if use_fp8_w8a8:
|
||||
align_block_size = 128 # deepgemm needs 128 m aligned block
|
||||
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
|
||||
else:
|
||||
align_block_size = None
|
||||
qhidden_states = hidden_states
|
||||
|
||||
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
|
||||
@@ -152,78 +125,36 @@ def benchmark_unpermute(
|
||||
)
|
||||
|
||||
def prepare():
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_hidden_states.to(dtype),
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_qhidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_qhidden_states.to(dtype),
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
)
|
||||
(
|
||||
permuted_hidden_states,
|
||||
_,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
_,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_hidden_states.to(dtype),
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
)
|
||||
|
||||
def run(input: tuple):
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = input
|
||||
output = torch.empty_like(hidden_states)
|
||||
moe_unpermute(
|
||||
output,
|
||||
permuted_hidden_states,
|
||||
topk_weights,
|
||||
inv_perm_idx,
|
||||
first_token_off,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = input
|
||||
_moe_unpermute_and_reduce(
|
||||
output_hidden_states,
|
||||
permuted_hidden_states,
|
||||
inv_perm,
|
||||
topk_weights,
|
||||
True,
|
||||
)
|
||||
(permuted_hidden_states, first_token_off, inv_perm_idx) = input
|
||||
output = torch.empty_like(hidden_states)
|
||||
moe_unpermute(
|
||||
output,
|
||||
permuted_hidden_states,
|
||||
topk_weights,
|
||||
inv_perm_idx,
|
||||
first_token_off,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
input = prepare()
|
||||
@@ -278,8 +209,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
use_customized_permute: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
) -> tuple[float, float]:
|
||||
set_random_seed(self.seed)
|
||||
|
||||
permute_time = benchmark_permute(
|
||||
@@ -291,7 +221,6 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
use_customized_permute=use_customized_permute,
|
||||
)
|
||||
unpermute_time = benchmark_unpermute(
|
||||
num_tokens,
|
||||
@@ -302,7 +231,6 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
use_customized_permute=use_customized_permute,
|
||||
)
|
||||
return permute_time, unpermute_time
|
||||
|
||||
@@ -330,6 +258,7 @@ def main(args: argparse.Namespace):
|
||||
config.architectures[0] == "DeepseekV3ForCausalLM"
|
||||
or config.architectures[0] == "DeepseekV2ForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeLiteForCausalLM"
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
@@ -348,7 +277,6 @@ def main(args: argparse.Namespace):
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_customized_permute = args.use_customized_permute
|
||||
|
||||
if args.batch_size is None:
|
||||
batch_sizes = [
|
||||
@@ -400,7 +328,6 @@ def main(args: argparse.Namespace):
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_customized_permute,
|
||||
)
|
||||
for batch_size in batch_sizes
|
||||
],
|
||||
@@ -420,7 +347,6 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
|
||||
)
|
||||
parser.add_argument("--use-customized-permute", action="store_true")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--trust-remote-code", action="store_true")
|
||||
|
||||
@@ -22,8 +22,8 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
assert current_platform.is_cuda(), (
|
||||
"Only support tune w8a8 block fp8 kernel on CUDA device."
|
||||
assert current_platform.is_cuda() or current_platform.is_rocm(), (
|
||||
"Only support tune w8a8 block fp8 kernel on CUDA/ROCm device."
|
||||
)
|
||||
|
||||
DTYPE_MAP = {
|
||||
|
||||
@@ -14,7 +14,7 @@ from vllm._custom_ops import (
|
||||
)
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
||||
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ def main(
|
||||
seed: int = 0,
|
||||
iters: int = 20,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
|
||||
@@ -7,8 +7,8 @@ import time
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
# Check if CPU MoE operations are available
|
||||
try:
|
||||
@@ -41,7 +41,7 @@ def main(
|
||||
seed: int = 0,
|
||||
iters: int = 20,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
# up_dim = 2 * intermediate_size for gate + up projection
|
||||
up_dim = 2 * intermediate_size
|
||||
|
||||
|
||||
@@ -14,7 +14,6 @@ from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import (
|
||||
calc_diff,
|
||||
fp8_gemm_nt,
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_block_cast_to_fp8,
|
||||
)
|
||||
|
||||
@@ -48,8 +47,9 @@ def benchmark_shape(
|
||||
block_size = [128, 128]
|
||||
|
||||
# Pre-quantize A for all implementations
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
|
||||
A, block_size[1], column_major_scales=True, tma_aligned_scales=True
|
||||
)
|
||||
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
|
||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||
|
||||
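Note: the change above folds the previously separate get_col_major_tma_aligned_tensor step into the quantization call itself. A sketch of the consolidated call; the keyword arguments come from the hunk, while the import path and the input tensor are assumptions.

import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,  # import path assumed
)

A = torch.randn(4096, 7168, dtype=torch.bfloat16, device="cuda")
# Group size 128 matches block_size[1] above; the scales come back already
# column-major and TMA-aligned, so no separate alignment pass is needed.
A_q, A_scale = per_token_group_quant_fp8(
    A, 128, column_major_scales=True, tma_aligned_scales=True
)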
@@ -13,6 +13,8 @@ endif()
|
||||
#
|
||||
# Define environment variables for special configurations
|
||||
#
|
||||
set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
|
||||
set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
|
||||
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
|
||||
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
|
||||
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
|
||||
@@ -103,6 +105,16 @@ else()
|
||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
||||
|
||||
# Support cross-compilation by allowing override via environment variables
|
||||
if (ENABLE_AVX2)
|
||||
set(AVX2_FOUND ON)
|
||||
message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
|
||||
endif()
|
||||
if (ENABLE_AVX512)
|
||||
set(AVX512_FOUND ON)
|
||||
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
@@ -347,6 +359,19 @@ else()
|
||||
add_compile_definitions(-DVLLM_NUMA_DISABLED)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Generate CPU attention dispatch header
|
||||
#
|
||||
message(STATUS "Generating CPU attention dispatch header")
|
||||
execute_process(
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
|
||||
RESULT_VARIABLE GEN_RESULT
|
||||
)
|
||||
if(NOT GEN_RESULT EQUAL 0)
|
||||
message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
|
||||
endif()
|
||||
|
||||
#
|
||||
# _C extension
|
||||
#
|
||||
@@ -379,6 +404,12 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
endif()
|
||||
|
||||
if(USE_ONEDNN)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/dnnl_kernels.cpp"
|
||||
|
||||
@@ -19,7 +19,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
flashmla
|
||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
|
||||
GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
|
||||
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
|
||||
GIT_PROGRESS TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
@@ -30,6 +30,24 @@ endif()
|
||||
FetchContent_MakeAvailable(flashmla)
|
||||
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
|
||||
# Vendor FlashMLA interface into vLLM with torch-ops shim.
|
||||
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
|
||||
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
|
||||
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
|
||||
FLASHMLA_INTERFACE_CONTENT)
|
||||
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
|
||||
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
|
||||
FLASHMLA_INTERFACE_CONTENT
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
|
||||
# Install the generated flash_mla_interface.py to the wheel
|
||||
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
|
||||
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
DESTINATION vllm/third_party/flashmla/
|
||||
COMPONENT _flashmla_C)
|
||||
|
||||
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||
# Only build FlashMLA kernels if we are building for something compatible with
|
||||
# sm90a
|
||||
@@ -55,16 +73,42 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_SOURCES
|
||||
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
|
||||
|
||||
# Misc kernels for decoding
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu
|
||||
|
||||
# sm90 dense decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
|
||||
|
||||
# sm90 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
|
||||
|
||||
# sm90 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
|
||||
|
||||
# sm100 dense prefill & backward
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
|
||||
|
||||
# sm100 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
|
||||
|
||||
# sm100 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
|
||||
)
|
||||
|
||||
set(FlashMLA_Extension_SOURCES
|
||||
@@ -76,6 +120,7 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/kerutils/include
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -83,7 +128,6 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_Extension_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -110,9 +154,12 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
||||
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
|
||||
target_compile_options(_flashmla_C PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
|
||||
|
||||
define_extension_target(
|
||||
_flashmla_extension_C
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
|
||||
|
||||
set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
|
||||
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
|
||||
|
||||
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
|
||||
# be directly set to the triton_kernels python directory.
|
||||
# be directly set to the triton_kernels python directory.
|
||||
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
|
||||
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
|
||||
FetchContent_Declare(
|
||||
@@ -24,7 +24,7 @@ else()
|
||||
)
|
||||
endif()
|
||||
|
||||
# Fetch content
|
||||
# Fetch content
|
||||
FetchContent_MakeAvailable(triton_kernels)
|
||||
|
||||
if (NOT triton_kernels_SOURCE_DIR)
|
||||
@@ -47,7 +47,7 @@ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/tr
|
||||
## Copy .py files to install directory.
|
||||
install(DIRECTORY
|
||||
${TRITON_KERNELS_PYTHON_DIR}
|
||||
DESTINATION
|
||||
DESTINATION
|
||||
vllm/third_party/triton_kernels/
|
||||
COMPONENT triton_kernels
|
||||
FILES_MATCHING PATTERN "*.py")
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <vector>
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
|
||||
@@ -24,7 +24,14 @@
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
#endif
|
||||
|
||||
#if defined(__gfx942__)
|
||||
constexpr float kFp8ScaleDivisor = 224.f;
|
||||
#else
|
||||
constexpr float kFp8ScaleDivisor = 448.f;
|
||||
#endif
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping) {
|
||||
torch::Device src_device = src.device();
|
||||
torch::Device dst_device = dst.device();
|
||||
@@ -49,10 +56,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
char* src_ptr = static_cast<char*>(src.data_ptr());
|
||||
char* dst_ptr = static_cast<char*>(dst.data_ptr());
|
||||
|
||||
// We use the stride instead of numel in case the cache is padded for memory
|
||||
// alignment reasons, we assume the blocks data (inclusive of any padding)
|
||||
// is contiguous in memory
|
||||
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(
|
||||
src_device.is_cuda() ? src_device : dst_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
@@ -205,7 +208,8 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
const int64_t block_stride, const int64_t page_stride,
|
||||
const int64_t head_stride, const int64_t key_stride,
|
||||
const int64_t value_stride, const int num_heads, const int head_size,
|
||||
const int block_size, const float* k_scale, const float* v_scale) {
|
||||
const int block_size, const float* k_scale, const float* v_scale,
|
||||
const int kv_scale_stride) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
@@ -229,21 +233,23 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
// this is true for the NHD layout where `head_stride == head_size`
|
||||
const bool is_contiguous_heads = (head_stride == head_size);
|
||||
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
if (is_contiguous_heads) {
|
||||
// NHD layout
|
||||
|
||||
if (is_contiguous_heads && kv_scale_stride == 0) {
|
||||
// NHD layout and k/v_scales are [1] (i.e. single scale for all heads)
|
||||
// kv cache: [num_blocks, block_size, num_heads, head_size]
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
|
||||
blockDim.x, k_op);
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
|
||||
threadIdx.x, blockDim.x, v_op);
|
||||
|
||||
} else {
|
||||
// HND layout OR k/v_scales are [num_heads] (i.e. per-attn-head)
|
||||
// HND layout: heads are strided, but each head_size segment is contiguous
|
||||
// kv cache: [num_blocks, num_heads, block_size, head_size]
|
||||
const int lane = threadIdx.x & 31; // 0..31 within warp
|
||||
@@ -259,6 +265,16 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
cache_t* __restrict__ v_dst_h =
|
||||
value_dst + static_cast<int64_t>(head) * head_stride;
|
||||
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
|
||||
? 0.f
|
||||
: k_scale[head * kv_scale_stride];
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
|
||||
? 0.f
|
||||
: v_scale[head * kv_scale_stride];
|
||||
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
|
||||
// within each head, let the 32 threads of the warp perform the vector
|
||||
// copy
|
||||
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
|
||||
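Note: a quick PyTorch check of the layout test used above (shapes invented). In the NHD cache layout the per-head stride equals head_size, so the heads of a token are contiguous; in HND they are strided by block_size * head_size, and with per-head scales the contiguous fast path is skipped even for NHD.

import torch

nhd = torch.empty(4, 16, 8, 64)   # [num_blocks, block_size, num_heads, head_size]
hnd = torch.empty(4, 8, 16, 64)   # [num_blocks, num_heads, block_size, head_size]
assert nhd.stride(2) == 64        # head_stride == head_size -> is_contiguous_heads
assert hnd.stride(1) == 16 * 64   # heads are strided in HND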
@@ -391,8 +407,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
|
||||
}
|
||||
|
||||
// Compute the scale for the tile
|
||||
float tile_scale = max_abs / 448.f;
|
||||
tile_scale = fmaxf(tile_scale, FLT_MIN);
|
||||
float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);
|
||||
|
||||
// The first lane of each half-warp writes the scale to kv_cache
|
||||
if ((lane_idx == 0) || (lane_idx == 16)) {
|
||||
@@ -461,11 +476,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__gfx942__)
|
||||
float scale = fmaxf(amax, 1e-4) / 224.0f;
|
||||
#else
|
||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
||||
#endif
|
||||
float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;
|
||||
|
||||
if (use_ue8m0) {
|
||||
scale = exp2f(ceilf(log2f(scale)));
|
||||
}
|
||||
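Note: the new kFp8ScaleDivisor constant centralizes the FP8 range used when deriving quantization scales: 448 for the OCP e4m3 format and 224 on gfx942, whose fnuz FP8 variant covers a smaller range (that rationale is our reading of the constant; the diff itself only defines the two values). The scale computation from the hunk above, restated in Python:

import math

def fp8_scale(amax: float, divisor: float = 448.0, use_ue8m0: bool = False) -> float:
    scale = max(amax, 1e-4) / divisor                  # divisor = kFp8ScaleDivisor
    if use_ue8m0:
        scale = 2.0 ** math.ceil(math.log2(scale))     # round scale up to a power of two
    return scale

assert fp8_scale(448.0) == 1.0
assert fp8_scale(300.0, use_ue8m0=True) == 1.0         # 300/448 ~= 0.67 -> next pow2 is 1.0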
@@ -608,7 +620,8 @@ void reshape_and_cache(
|
||||
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
|
||||
head_stride, key_stride, value_stride, num_heads, head_size, \
|
||||
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
|
||||
reinterpret_cast<const float*>(v_scale.data_ptr()));
|
||||
reinterpret_cast<const float*>(v_scale.data_ptr()), \
|
||||
kv_scale_stride);
|
||||
|
||||
void reshape_and_cache_flash(
|
||||
torch::Tensor& key, // [num_tokens, num_heads, head_size]
|
||||
@@ -617,8 +630,9 @@ void reshape_and_cache_flash(
|
||||
torch::Tensor&
|
||||
value_cache, // [num_blocks, block_size, num_heads, head_size]
|
||||
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale) {
|
||||
const std::string& kv_cache_dtype,
|
||||
torch::Tensor& k_scale, // [1] or [num_heads]
|
||||
torch::Tensor& v_scale) { // [1] or [num_heads]
|
||||
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
|
||||
// slot_mapping.size(0) because of padding for CUDA graphs.
|
||||
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
|
||||
@@ -641,6 +655,12 @@ void reshape_and_cache_flash(
|
||||
int64_t head_stride = key_cache.stride(2);
|
||||
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
|
||||
|
||||
TORCH_CHECK(k_scale.sizes() == v_scale.sizes(),
|
||||
"k_scale and v_scale must have the same shape");
|
||||
TORCH_CHECK(k_scale.numel() == 1 || k_scale.numel() == num_heads,
|
||||
"k_scale and v_scale must be of shape [1] or [num_heads]");
|
||||
int kv_scale_stride = (k_scale.numel() > 1) ? 1 : 0;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(num_heads * head_size, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
||||
|
||||
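Note: the kv_scale_stride introduced above lets one indexing expression, k_scale[head * kv_scale_stride], serve both the legacy [1] scale and the new per-head [num_heads] scales. The selection logic in Python form (tensor values invented):

import torch

def scale_for_head(k_scale: torch.Tensor, head: int) -> float:
    kv_scale_stride = 1 if k_scale.numel() > 1 else 0   # mirrors the TORCH_CHECKed shapes
    return float(k_scale[head * kv_scale_stride])

assert scale_for_head(torch.tensor([0.5]), head=3) == 0.5                     # shared scale
assert abs(scale_for_head(torch.tensor([0.1, 0.2, 0.3, 0.4]), head=3) - 0.4) < 1e-6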
@@ -1,79 +1,4 @@
|
||||
#include "cpu_attn_vec.hpp"
|
||||
#include "cpu_attn_vec16.hpp"
|
||||
|
||||
#ifdef CPU_CAPABILITY_AMXBF16
|
||||
#include "cpu_attn_amx.hpp"
|
||||
#define AMX_DISPATCH(...) \
|
||||
case cpu_attention::ISA::AMX: { \
|
||||
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
|
||||
scalar_t, head_dim>; \
|
||||
return __VA_ARGS__(); \
|
||||
}
|
||||
#else
|
||||
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
#include "cpu_attn_neon.hpp"
|
||||
// NEON requires head_dim to be a multiple of 32
|
||||
#define NEON_DISPATCH(...) \
|
||||
case cpu_attention::ISA::NEON: { \
|
||||
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
|
||||
scalar_t, head_dim>; \
|
||||
return __VA_ARGS__(); \
|
||||
}
|
||||
#else
|
||||
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
|
||||
#endif // #ifdef __aarch64__
|
||||
|
||||
#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
|
||||
case HEAD_DIM: { \
|
||||
constexpr size_t head_dim = HEAD_DIM; \
|
||||
return __VA_ARGS__(); \
|
||||
}
|
||||
|
||||
#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...) \
|
||||
[&] { \
|
||||
switch (HEAD_DIM) { \
|
||||
CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__) \
|
||||
CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__) \
|
||||
default: { \
|
||||
TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
|
||||
std::to_string(HEAD_DIM)); \
|
||||
} \
|
||||
} \
|
||||
}()
|
||||
|
||||
#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...) \
|
||||
[&] { \
|
||||
switch (ISA_TYPE) { \
|
||||
AMX_DISPATCH(__VA_ARGS__) \
|
||||
NEON_DISPATCH(__VA_ARGS__) \
|
||||
case cpu_attention::ISA::VEC: { \
|
||||
using attn_impl = \
|
||||
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t, \
|
||||
head_dim>; \
|
||||
return __VA_ARGS__(); \
|
||||
} \
|
||||
case cpu_attention::ISA::VEC16: { \
|
||||
using attn_impl = \
|
||||
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
|
||||
head_dim>; \
|
||||
return __VA_ARGS__(); \
|
||||
} \
|
||||
default: { \
|
||||
TORCH_CHECK(false, "Invalid CPU attention ISA type."); \
|
||||
} \
|
||||
} \
|
||||
}()
|
||||
#include "cpu_attn_dispatch_generated.h"
|
||||
|
||||
torch::Tensor get_scheduler_metadata(
|
||||
const int64_t num_req, const int64_t num_heads_q,
|
||||
@@ -122,16 +47,14 @@ torch::Tensor get_scheduler_metadata(
|
||||
input.enable_kv_split = enable_kv_split;
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
|
||||
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
|
||||
CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
|
||||
input.elem_size = sizeof(scalar_t);
|
||||
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
|
||||
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
|
||||
input.output_buffer_elem_size =
|
||||
sizeof(attn_impl::partial_output_buffer_t);
|
||||
input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
|
||||
input.kv_block_alignment = attn_impl::BlockSizeAlignment;
|
||||
});
|
||||
CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
|
||||
input.elem_size = sizeof(scalar_t);
|
||||
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
|
||||
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
|
||||
input.output_buffer_elem_size =
|
||||
sizeof(attn_impl::partial_output_buffer_t);
|
||||
input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
|
||||
input.kv_block_alignment = attn_impl::BlockSizeAlignment;
|
||||
});
|
||||
});
|
||||
|
||||
@@ -184,18 +107,14 @@ void cpu_attn_reshape_and_cache(
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
|
||||
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
|
||||
CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
|
||||
attn_impl::reshape_and_cache(
|
||||
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
|
||||
key_cache.data_ptr<scalar_t>(),
|
||||
value_cache.data_ptr<scalar_t>(),
|
||||
slot_mapping.data_ptr<int64_t>(), token_num,
|
||||
key_token_num_stride, value_token_num_stride, head_num,
|
||||
key_head_num_stride, value_head_num_stride, num_blocks,
|
||||
num_blocks_stride, cache_head_num_stride, block_size,
|
||||
block_size_stride);
|
||||
});
|
||||
CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
|
||||
attn_impl::reshape_and_cache(
|
||||
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
|
||||
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
|
||||
slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
|
||||
value_token_num_stride, head_num, key_head_num_stride,
|
||||
value_head_num_stride, num_blocks, num_blocks_stride,
|
||||
cache_head_num_stride, block_size, block_size_stride);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -257,12 +176,10 @@ void cpu_attention_with_kv_cache(
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
|
||||
CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
|
||||
CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
|
||||
TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
|
||||
cpu_attention::AttentionMainLoop<attn_impl> mainloop;
|
||||
mainloop(&input);
|
||||
});
|
||||
CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
|
||||
TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
|
||||
cpu_attention::AttentionMainLoop<attn_impl> mainloop;
|
||||
mainloop(&input);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
|
||||
const int32_t q_heads_per_kv, const int64_t q_num_stride,
|
||||
const int64_t q_head_stride, const float scale) {
|
||||
constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
|
||||
// static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
|
||||
static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
|
||||
constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
|
||||
constexpr int64_t head_elem_num_pre_block =
|
||||
AMX_TILE_ROW_BYTES / sizeof(scalar_t);
|
||||
|
||||
@@ -816,14 +816,10 @@ struct VecTypeTrait<float> {
|
||||
using vec_t = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
// ARM only supports BF16 with ARMv8.6-A extension
|
||||
#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
|
||||
#else
|
||||
template <>
|
||||
struct VecTypeTrait<c10::BFloat16> {
|
||||
using vec_t = vec_op::BF16Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
#if !defined(__powerpc__) && !defined(__s390x__)
|
||||
template <>
|
||||
@@ -1111,7 +1107,8 @@ class AttentionMainLoop {
|
||||
if (sliding_window_left != -1) {
|
||||
pos = std::max(pos, curr_token_pos - sliding_window_left);
|
||||
}
|
||||
return pos;
|
||||
// Clamp to tile end to avoid OOB when window starts past the tile
|
||||
return std::min(pos, kv_tile_end_pos);
|
||||
}();
|
||||
|
||||
int32_t right_kv_pos = [&]() {
|
||||
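Note: the clamp added above, restated in Python so the fix is easy to see (names mirror the C++, values invented). Without the final min(), a sliding window whose start lies beyond the current KV tile produced a start position past the tile end.

def left_kv_pos(pos, curr_token_pos, sliding_window_left, kv_tile_end_pos):
    if sliding_window_left != -1:
        pos = max(pos, curr_token_pos - sliding_window_left)
    return min(pos, kv_tile_end_pos)   # the new clamp

# Window start (90) lies past the tile end (64): previously returned 90, now 64.
assert left_kv_pos(0, 100, 10, 64) == 64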
@@ -1585,17 +1582,10 @@ class AttentionMainLoop {
|
||||
|
||||
if (use_sink) {
|
||||
alignas(64) float s_aux_fp32[16];
|
||||
#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
|
||||
// ARM without native BF16 support: manual conversion
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
|
||||
}
|
||||
#else
|
||||
// All other platforms have BF16Vec16 available
|
||||
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
|
||||
vec_op::FP32Vec16 vec_fp32(vec_bf16);
|
||||
vec_fp32.save(s_aux_fp32);
|
||||
#endif
|
||||
|
||||
float* __restrict__ curr_sum_buffer = sum_buffer;
|
||||
float* __restrict__ curr_max_buffer = max_buffer;
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
#include "cpu_attn_impl.hpp"
|
||||
#include <arm_neon.h>
|
||||
#include <type_traits>
|
||||
#ifdef ARM_BF16_SUPPORT
|
||||
#include "cpu_attn_neon_bfmmla.hpp"
|
||||
#endif
|
||||
namespace cpu_attention {
|
||||
|
||||
namespace {
|
||||
@@ -57,7 +60,7 @@ FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
|
||||
#endif
|
||||
}
|
||||
|
||||
// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
|
||||
// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with ASIMD FMLAs
|
||||
// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
|
||||
// #FMLAs = (K // 4) * (4 * 2 * M)
|
||||
// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
|
||||
@@ -264,7 +267,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
|
||||
constexpr static ISA ISAType = ISA::NEON;
|
||||
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
|
||||
|
||||
// static_assert(HeadDim % HeadDimAlignment == 0);
|
||||
static_assert(HeadDim % HeadDimAlignment == 0);
|
||||
// the gemm micro kernel is Mx8
|
||||
static_assert(HeadDimAlignment % 8 == 0);
|
||||
static_assert(BlockSizeAlignment % 8 == 0);
|
||||
@@ -381,6 +384,18 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef ARM_BF16_SUPPORT
|
||||
// For BF16 on Arm, reuse the BFMMLA kernels with 32-token alignment.
|
||||
template <int64_t head_dim>
|
||||
class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim>
|
||||
: public AttentionImplNEONBFMMLA<BLOCK_SIZE_ALIGNMENT, ISA::NEON,
|
||||
head_dim> {};
|
||||
#endif
|
||||
} // namespace cpu_attention
|
||||
|
||||
#endif // #ifndef CPU_ATTN_NEON_HPP
|
||||
#undef BLOCK_SIZE_ALIGNMENT
|
||||
#undef HEAD_SIZE_ALIGNMENT
|
||||
#undef MAX_Q_HEAD_NUM_PER_ITER
|
||||
|
||||
#endif // #ifndef CPU_ATTN_ASIMD_HPP
|
||||
|
||||
682 csrc/cpu/cpu_attn_neon_bfmmla.hpp (new file)
@@ -0,0 +1,682 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#ifndef CPU_ATTN_NEON_BFMMLA_HPP
|
||||
#define CPU_ATTN_NEON_BFMMLA_HPP
|
||||
|
||||
#include "cpu_attn_impl.hpp"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace cpu_attention {
|
||||
|
||||
namespace {
|
||||
|
||||
// BFMMLA tile dimensions
|
||||
constexpr int32_t TILE_ROWS = 2; // M dimension
|
||||
constexpr int32_t TILE_K = 4; // K reduction
|
||||
constexpr int32_t TILE_COLS = 2; // N dimension (column-pair)
|
||||
|
||||
// Derived constants
|
||||
constexpr int32_t OUTPUT_COLS_PER_BLOCK = 8; // 4 column-pairs
|
||||
constexpr int32_t K_TOKENS_PER_GROUP = 8; // Tokens grouped in K cache
|
||||
constexpr int32_t V_TOKENS_PER_ROW_BLOCK = 4; // Tokens per V cache row block
|
||||
constexpr int32_t K_INNER_STRIDE = K_TOKENS_PER_GROUP * TILE_K;
|
||||
constexpr int32_t V_INNER_STRIDE = V_TOKENS_PER_ROW_BLOCK * TILE_COLS;
|
||||
constexpr int32_t PACK_ELEMENTS_PER_K_CHUNK = TILE_ROWS * TILE_K; // A packing
|
||||
|
||||
// Matrix Packing and Accumulator
|
||||
// Reshape two rows of Q into BFMMLA-friendly interleaved
|
||||
// Input: row0 = [a0,a1,a2,a3], row1 = [b0,b1,b2,b3]
|
||||
// Output: [a0,a1,a2,a3,b0,b1,b2,b3, a4,a5,a6,a7,b4,b5,b6,b7]
|
||||
// For K tail (K % TILE_K != 0): pads with zeros to complete the final chunk
|
||||
FORCE_INLINE void reshape_Q_2xK_for_bfmmla(const c10::BFloat16* __restrict r0,
|
||||
const c10::BFloat16* __restrict r1,
|
||||
c10::BFloat16* __restrict dst,
|
||||
int32_t K) {
|
||||
const uint16_t* s0 = reinterpret_cast<const uint16_t*>(r0);
|
||||
const uint16_t* s1 = reinterpret_cast<const uint16_t*>(r1);
|
||||
uint16_t* d = reinterpret_cast<uint16_t*>(dst);
|
||||
|
||||
// Process TILE_K elements at a time (PACK_ELEMENTS_PER_K_CHUNK output)
|
||||
int32_t k = 0;
|
||||
for (; k + TILE_K <= K; k += TILE_K, d += PACK_ELEMENTS_PER_K_CHUNK) {
|
||||
vst1q_u16(d, vcombine_u16(vld1_u16(s0 + k), vld1_u16(s1 + k)));
|
||||
}
|
||||
|
||||
// Handle K tail: pack remaining elements with zero-padding
|
||||
const int32_t tail = K - k;
|
||||
if (tail > 0) {
|
||||
// Pack remaining tail elements: [r0[k..k+tail-1], pad, r1[k..k+tail-1],
|
||||
// pad]
|
||||
for (int32_t t = 0; t < tail; ++t) {
|
||||
d[t] = s0[k + t];
|
||||
d[t + TILE_K] = s1[k + t];
|
||||
}
|
||||
// Zero-pad the rest
|
||||
for (int32_t t = tail; t < TILE_K; ++t) {
|
||||
d[t] = 0;
|
||||
d[t + TILE_K] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
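Note: a pure-Python picture of the interleaving reshape_Q_2xK_for_bfmmla performs (illustration only; the real routine also zero-pads a K tail). BFMMLA consumes 2x4 BF16 tiles, so two Q rows are woven together four elements at a time.

import numpy as np

def pack_2xK(row0, row1, tile_k=4):
    assert len(row0) == len(row1) and len(row0) % tile_k == 0
    out = []
    for k in range(0, len(row0), tile_k):
        out.extend(row0[k:k + tile_k])   # a_k .. a_{k+3}
        out.extend(row1[k:k + tile_k])   # b_k .. b_{k+3}
    return np.array(out)

packed = pack_2xK(np.arange(0, 8), np.arange(10, 18))
# [a0..a3, b0..b3, a4..a7, b4..b7], matching the comment above.
assert list(packed) == [0, 1, 2, 3, 10, 11, 12, 13, 4, 5, 6, 7, 14, 15, 16, 17]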
// 2x2 accumulator load/store with compile-time row count
|
||||
template <int32_t m_rows>
|
||||
FORCE_INLINE float32x4_t load_acc_2x2(float* base, int64_t ldc, int col_off) {
|
||||
static_assert(m_rows == 1 || m_rows == 2);
|
||||
float32x2_t row0 = vld1_f32(base + col_off);
|
||||
float32x2_t row1 =
|
||||
(m_rows == 2) ? vld1_f32(base + ldc + col_off) : vdup_n_f32(0.f);
|
||||
return vcombine_f32(row0, row1);
|
||||
}
|
||||
|
||||
template <int32_t m_rows>
|
||||
FORCE_INLINE void store_acc_2x2(float32x4_t acc, float* base, int64_t ldc,
|
||||
int col_off) {
|
||||
static_assert(m_rows == 1 || m_rows == 2);
|
||||
vst1_f32(base + col_off, vget_low_f32(acc));
|
||||
if constexpr (m_rows == 2) {
|
||||
vst1_f32(base + ldc + col_off, vget_high_f32(acc));
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize 4 column-pair accumulators for 2 rows (8 columns total)
|
||||
#define INIT_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows, accum) \
|
||||
do { \
|
||||
if (accum) { \
|
||||
if (m_rows == 2) { \
|
||||
a0 = load_acc_2x2<2>(Crow, ldc, 0); \
|
||||
a1 = load_acc_2x2<2>(Crow, ldc, 2); \
|
||||
a2 = load_acc_2x2<2>(Crow, ldc, 4); \
|
||||
a3 = load_acc_2x2<2>(Crow, ldc, 6); \
|
||||
} else { \
|
||||
a0 = load_acc_2x2<1>(Crow, ldc, 0); \
|
||||
a1 = load_acc_2x2<1>(Crow, ldc, 2); \
|
||||
a2 = load_acc_2x2<1>(Crow, ldc, 4); \
|
||||
a3 = load_acc_2x2<1>(Crow, ldc, 6); \
|
||||
} \
|
||||
} else { \
|
||||
a0 = a1 = a2 = a3 = vdupq_n_f32(0.f); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Store 4 column-pair accumulators back to C matrix
|
||||
#define STORE_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows) \
|
||||
do { \
|
||||
if (m_rows == 2) { \
|
||||
store_acc_2x2<2>(a0, Crow, ldc, 0); \
|
||||
store_acc_2x2<2>(a1, Crow, ldc, 2); \
|
||||
store_acc_2x2<2>(a2, Crow, ldc, 4); \
|
||||
store_acc_2x2<2>(a3, Crow, ldc, 6); \
|
||||
} else { \
|
||||
store_acc_2x2<1>(a0, Crow, ldc, 0); \
|
||||
store_acc_2x2<1>(a1, Crow, ldc, 2); \
|
||||
store_acc_2x2<1>(a2, Crow, ldc, 4); \
|
||||
store_acc_2x2<1>(a3, Crow, ldc, 6); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Perform 4 BFMMLA operations: acc += A @ B for 4 column-pairs
|
||||
#define BFMMLA_COMPUTE_4(r0, r1, r2, r3, a, b0, b1, b2, b3) \
|
||||
do { \
|
||||
r0 = vbfmmlaq_f32(r0, a, b0); \
|
||||
r1 = vbfmmlaq_f32(r1, a, b1); \
|
||||
r2 = vbfmmlaq_f32(r2, a, b2); \
|
||||
r3 = vbfmmlaq_f32(r3, a, b3); \
|
||||
} while (0)
|
||||
|
||||
// Micro-kernel: updates a small fixed tile using BFMMLA.
|
||||
// RP = number of row-pairs (1,2,4)
|
||||
// Computes C[TILE_ROWS*RP, OUTPUT_COLS_PER_BLOCK] += A_packed @ B.
|
||||
// A_packed interleaves RP row-pairs; B layout is driven by the attention phase:
|
||||
// - AttentionGemmPhase::QK -> token-column layout (Q @ K^T)
|
||||
// - AttentionGemmPhase::PV -> token-row layout (P @ V)
|
||||
// K_static < 0 enables runtime K (PV only)
|
||||
template <int32_t RP, int32_t K_static, AttentionGemmPhase phase>
|
||||
FORCE_INLINE void gemm_rowpairs_x8_bfmmla_neon(
|
||||
const bfloat16_t* const* __restrict A_packed_rp,
|
||||
const int32_t* __restrict m_rows_rp, const bfloat16_t* __restrict B_blk,
|
||||
float* __restrict C, int64_t ldc, bool accumulate, int64_t b_stride,
|
||||
int32_t K_runtime = 0) {
|
||||
static_assert(RP == 1 || RP == 2 || RP == 4, "RP must be 1,2,4");
|
||||
static_assert(K_static < 0 || K_static % TILE_K == 0,
|
||||
"K must be divisible by TILE_K");
|
||||
static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
|
||||
"Runtime K only supported for PV");
|
||||
|
||||
constexpr bool runtime_k = (K_static < 0);
|
||||
const int32_t K_iters =
|
||||
runtime_k ? (K_runtime / TILE_K) : (K_static / TILE_K);
|
||||
const int32_t K_tail = runtime_k ? (K_runtime % TILE_K) : 0;
|
||||
|
||||
if (!runtime_k) {
|
||||
// Help the compiler fold away unused K_runtime when K is compile-time
|
||||
(void)K_runtime;
|
||||
}
|
||||
|
||||
auto* C_al = C;
|
||||
const auto* B_al = B_blk;
|
||||
|
||||
// Setup A pointers
|
||||
const bfloat16_t* a_ptr[4] = {
|
||||
A_packed_rp[0],
|
||||
(RP >= 2) ? A_packed_rp[1] : nullptr,
|
||||
(RP >= 4) ? A_packed_rp[2] : nullptr,
|
||||
(RP >= 4) ? A_packed_rp[3] : nullptr,
|
||||
};
|
||||
|
||||
// Setup B pointers based on layout
|
||||
const bfloat16_t* b_ptr[4];
|
||||
if constexpr (phase == AttentionGemmPhase::PV) {
|
||||
b_ptr[0] = B_blk + 0 * b_stride;
|
||||
b_ptr[1] = B_blk + 1 * b_stride;
|
||||
b_ptr[2] = B_blk + 2 * b_stride;
|
||||
b_ptr[3] = B_blk + 3 * b_stride;
|
||||
}
|
||||
|
||||
float32x4_t acc[4][4];
|
||||
|
||||
// Initialize accumulators
|
||||
#define INIT_RP(rp) \
|
||||
if constexpr (RP > rp) { \
|
||||
INIT_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
|
||||
C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp], accumulate); \
|
||||
}
|
||||
INIT_RP(0);
|
||||
INIT_RP(1);
|
||||
INIT_RP(2);
|
||||
INIT_RP(3);
|
||||
#undef INIT_RP
|
||||
|
||||
// Main compute loop
|
||||
for (int32_t ki = 0; ki < K_iters; ++ki) {
|
||||
bfloat16x8_t b0, b1, b2, b3;
|
||||
if constexpr (phase == AttentionGemmPhase::PV) {
|
||||
b0 = vld1q_bf16(b_ptr[0] + ki * V_INNER_STRIDE);
|
||||
b1 = vld1q_bf16(b_ptr[1] + ki * V_INNER_STRIDE);
|
||||
b2 = vld1q_bf16(b_ptr[2] + ki * V_INNER_STRIDE);
|
||||
b3 = vld1q_bf16(b_ptr[3] + ki * V_INNER_STRIDE);
|
||||
} else {
|
||||
const bfloat16_t* b_base = B_al + ki * b_stride;
|
||||
b0 = vld1q_bf16(b_base + 0 * V_INNER_STRIDE);
|
||||
b1 = vld1q_bf16(b_base + 1 * V_INNER_STRIDE);
|
||||
b2 = vld1q_bf16(b_base + 2 * V_INNER_STRIDE);
|
||||
b3 = vld1q_bf16(b_base + 3 * V_INNER_STRIDE);
|
||||
}
|
||||
|
||||
#define COMPUTE_RP(rp) \
|
||||
if constexpr (RP > rp) { \
|
||||
bfloat16x8_t a = vld1q_bf16(a_ptr[rp] + ki * PACK_ELEMENTS_PER_K_CHUNK); \
|
||||
BFMMLA_COMPUTE_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], a, b0, \
|
||||
b1, b2, b3); \
|
||||
}
|
||||
COMPUTE_RP(0);
|
||||
COMPUTE_RP(1);
|
||||
COMPUTE_RP(2);
|
||||
COMPUTE_RP(3);
|
||||
#undef COMPUTE_RP
|
||||
}
|
||||
|
||||
// K tail for runtime PV: fallback path
|
||||
if constexpr (runtime_k) {
|
||||
if (K_tail > 0) {
|
||||
const int32_t tail_offset = K_iters * V_INNER_STRIDE;
|
||||
const int32_t a_tail_offset = K_iters * PACK_ELEMENTS_PER_K_CHUNK;
|
||||
for (int32_t kt = 0; kt < K_tail; ++kt) {
|
||||
float32x4_t b_vecs[4];
|
||||
for (int32_t p = 0; p < 4; ++p) {
|
||||
const bfloat16_t* bp = b_ptr[p] + tail_offset + kt * TILE_COLS;
|
||||
const float b0 = vcvtah_f32_bf16(bp[0]);
|
||||
const float b1 = vcvtah_f32_bf16(bp[1]);
|
||||
const float32x2_t b_pair = vset_lane_f32(b1, vdup_n_f32(b0), 1);
|
||||
b_vecs[p] = vcombine_f32(b_pair, b_pair);
|
||||
}
|
||||
|
||||
#define TAIL_RP(rp) \
|
||||
if constexpr (RP > rp) { \
|
||||
const bfloat16_t* ap = A_packed_rp[rp] + a_tail_offset; \
|
||||
float a_row0 = vcvtah_f32_bf16(ap[kt]); \
|
||||
float a_row1 = \
|
||||
(m_rows_rp[rp] == 2) ? vcvtah_f32_bf16(ap[kt + TILE_K]) : 0.0f; \
|
||||
const float32x4_t a_vec = \
|
||||
vcombine_f32(vdup_n_f32(a_row0), vdup_n_f32(a_row1)); \
|
||||
for (int32_t p = 0; p < 4; ++p) { \
|
||||
acc[rp][p] = vmlaq_f32(acc[rp][p], a_vec, b_vecs[p]); \
|
||||
} \
|
||||
}
|
||||
TAIL_RP(0);
|
||||
TAIL_RP(1);
|
||||
TAIL_RP(2);
|
||||
TAIL_RP(3);
|
||||
#undef TAIL_RP
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store results
|
||||
#define STORE_RP(rp) \
|
||||
if constexpr (RP > rp) { \
|
||||
STORE_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
|
||||
C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp]); \
|
||||
}
|
||||
STORE_RP(0);
|
||||
STORE_RP(1);
|
||||
STORE_RP(2);
|
||||
STORE_RP(3);
|
||||
#undef STORE_RP
|
||||
}
|
||||
|
||||
// Meso-kernel: packs a small MBxK slice of A, then tiles over N and calls the
|
||||
// micro-kernel for each OUTPUT_COLS_PER_BLOCK chunk. K_static < 0 enables
|
||||
// runtime K (PV only).
|
||||
template <int32_t MB, int32_t N, int32_t K_static, AttentionGemmPhase phase>
|
||||
FORCE_INLINE void gemm_packA_compute_MB_xN(
|
||||
const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
|
||||
float* __restrict C, int32_t K_runtime, int64_t lda, int64_t ldc,
|
||||
int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
|
||||
static_assert(MB >= 1 && MB <= 8, "MB must be in [1,8]");
|
||||
static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
|
||||
"N must be a multiple of OUTPUT_COLS_PER_BLOCK");
|
||||
static_assert(K_static < 0 || K_static % TILE_K == 0,
|
||||
"K must be divisible by TILE_K");
|
||||
static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
|
||||
"Runtime K only supported for PV");
|
||||
|
||||
constexpr bool runtime_k = (K_static < 0);
|
||||
const int32_t K_val = runtime_k ? K_runtime : K_static;
|
||||
|
||||
// Keep small packs on-stack to avoid heap churn
|
||||
constexpr int32_t STACK_PACK_STRIDE =
|
||||
(1024 / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
|
||||
|
||||
constexpr int32_t ROW_PAIRS = (MB + 1) / TILE_ROWS;
|
||||
const int32_t pack_stride =
|
||||
runtime_k ? ((K_val + TILE_K - 1) / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK
|
||||
: (K_static / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
|
||||
|
||||
alignas(64) c10::BFloat16 A_packed_stack[ROW_PAIRS * STACK_PACK_STRIDE];
|
||||
std::vector<c10::BFloat16> A_packed_heap;
|
||||
c10::BFloat16* A_packed =
|
||||
(pack_stride <= STACK_PACK_STRIDE)
|
||||
? A_packed_stack
|
||||
: (A_packed_heap.resize(ROW_PAIRS * pack_stride),
|
||||
A_packed_heap.data());
|
||||
|
||||
for (int32_t rp = 0; rp < ROW_PAIRS; ++rp) {
|
||||
const int32_t m = rp * TILE_ROWS;
|
||||
const int32_t m_rows = (m + 1 < MB) ? TILE_ROWS : 1;
|
||||
const c10::BFloat16* A0 = A + m * lda;
|
||||
const c10::BFloat16* A1 = (m_rows == TILE_ROWS) ? (A + (m + 1) * lda) : A0;
|
||||
reshape_Q_2xK_for_bfmmla(A0, A1, A_packed + rp * pack_stride, K_val);
|
||||
}
|
||||
|
||||
for (int32_t n = 0; n < N; n += OUTPUT_COLS_PER_BLOCK) {
|
||||
const c10::BFloat16* B_blk_c10 =
|
||||
(phase == AttentionGemmPhase::PV)
|
||||
? (B + (n / TILE_COLS) * b_layout_stride)
|
||||
: (B + (n / OUTPUT_COLS_PER_BLOCK) * b_layout_stride);
|
||||
const bfloat16_t* B_blk = reinterpret_cast<const bfloat16_t*>(B_blk_c10);
|
||||
|
||||
// Process row-pairs in groups of 4, 2, then 1
|
||||
int32_t row_pair_idx = 0;
|
||||
|
||||
#define PROCESS_RP_GROUP(group_size) \
|
||||
for (; row_pair_idx + (group_size - 1) < ROW_PAIRS; \
|
||||
row_pair_idx += group_size) { \
|
||||
const bfloat16_t* Ap[group_size]; \
|
||||
int32_t mr[group_size]; \
|
||||
for (int32_t i = 0; i < group_size; ++i) { \
|
||||
Ap[i] = reinterpret_cast<const bfloat16_t*>( \
|
||||
A_packed + (row_pair_idx + i) * pack_stride); \
|
||||
mr[i] = (((row_pair_idx + i) * TILE_ROWS + 1) < MB) ? TILE_ROWS : 1; \
|
||||
} \
|
||||
float* C_blk = C + (row_pair_idx * TILE_ROWS) * ldc + n; \
|
||||
if constexpr (runtime_k) { \
|
||||
gemm_rowpairs_x8_bfmmla_neon<group_size, -1, phase>( \
|
||||
Ap, mr, B_blk, C_blk, ldc, accumulate, b_layout_stride, K_val); \
|
||||
} else { \
|
||||
gemm_rowpairs_x8_bfmmla_neon<group_size, K_static, phase>( \
|
||||
Ap, mr, B_blk, C_blk, ldc, accumulate, \
|
||||
(phase == AttentionGemmPhase::PV) ? b_layout_stride \
|
||||
: b_reduction_stride); \
|
||||
} \
|
||||
}
|
||||
|
||||
PROCESS_RP_GROUP(4);
|
||||
PROCESS_RP_GROUP(2);
|
||||
PROCESS_RP_GROUP(1);
|
||||
#undef PROCESS_RP_GROUP
|
||||
}
|
||||
}
|
||||
|
||||
// Macro-kernel: iterates over M in MB={8,4,2,1} chunks.
|
||||
// Supports compile-time K specialization when K >= 0; otherwise uses runtime K
|
||||
// (runtime K path is only supported for PV).
|
||||
template <AttentionGemmPhase phase, int32_t N, int32_t K = -1>
|
||||
FORCE_INLINE void gemm_macro_neon_bfmmla(
|
||||
const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
|
||||
float* __restrict C, int32_t M, int32_t K_runtime, int64_t lda, int64_t ldc,
|
||||
int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
|
||||
static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
|
||||
"N must be a multiple of OUTPUT_COLS_PER_BLOCK");
|
||||
|
||||
if constexpr (K >= 0) {
|
||||
static_assert(K % TILE_K == 0, "K must be divisible by TILE_K");
|
||||
for (int32_t m = 0; m < M;) {
|
||||
const int32_t rem = M - m;
|
||||
const c10::BFloat16* A_blk = A + m * lda;
|
||||
float* C_blk = C + m * ldc;
|
||||
|
||||
#define DISPATCH_MB(mb) \
|
||||
gemm_packA_compute_MB_xN<mb, N, K, phase>(A_blk, B, C_blk, 0, lda, ldc, \
|
||||
b_layout_stride, \
|
||||
b_reduction_stride, accumulate)
|
||||
|
||||
if (rem >= 8) {
|
||||
DISPATCH_MB(8);
|
||||
m += 8;
|
||||
} else if (rem >= 4) {
|
||||
DISPATCH_MB(4);
|
||||
m += 4;
|
||||
} else if (rem >= 2) {
|
||||
DISPATCH_MB(2);
|
||||
m += 2;
|
||||
} else {
|
||||
DISPATCH_MB(1);
|
||||
m += 1;
|
||||
}
|
||||
#undef DISPATCH_MB
|
||||
}
|
||||
} else {
|
||||
static_assert(phase == AttentionGemmPhase::PV,
|
||||
"Runtime K specialization only supported for PV.");
|
||||
const int32_t K_val = K_runtime;
|
||||
|
||||
for (int32_t m = 0; m < M;) {
|
||||
const int32_t rem = M - m;
|
||||
const c10::BFloat16* A_blk = A + m * lda;
|
||||
float* C_blk = C + m * ldc;
|
||||
|
||||
#define DISPATCH_MB_RUNTIME(mb) \
|
||||
gemm_packA_compute_MB_xN<mb, N, -1, phase>(A_blk, B, C_blk, K_val, lda, ldc, \
|
||||
b_layout_stride, \
|
||||
b_reduction_stride, accumulate)
|
||||
|
||||
if (rem >= 8) {
|
||||
DISPATCH_MB_RUNTIME(8);
|
||||
m += 8;
|
||||
} else if (rem >= 4) {
|
||||
DISPATCH_MB_RUNTIME(4);
|
||||
m += 4;
|
||||
} else if (rem >= 2) {
|
||||
DISPATCH_MB_RUNTIME(2);
|
||||
m += 2;
|
||||
} else {
|
||||
DISPATCH_MB_RUNTIME(1);
|
||||
m += 1;
|
||||
}
|
||||
#undef DISPATCH_MB_RUNTIME
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
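Note: the macro-kernel above walks the M dimension greedily in chunks of 8, 4, 2 and 1 rows. A one-function illustration of the block sizes that produces (not part of the source):

def mb_chunks(M: int) -> list[int]:
    out, m = [], 0
    while m < M:
        for mb in (8, 4, 2, 1):           # mirrors the DISPATCH_MB ladder above
            if M - m >= mb:
                out.append(mb)
                m += mb
                break
    return out

assert mb_chunks(13) == [8, 4, 1]
assert mb_chunks(6) == [4, 2]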
#undef INIT_ACC_ROWPAIR_4
|
||||
#undef STORE_ACC_ROWPAIR_4
|
||||
#undef BFMMLA_COMPUTE_4
|
||||
|
||||
} // namespace

// TileGemm Adapter for Attention

template <typename kv_cache_t, int32_t BlockTokens, int32_t HeadDim>
class TileGemmNEONBFMMLA {
 public:
  template <AttentionGemmPhase phase, int32_t head_dim_ct>
  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
                                kv_cache_t* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                [[maybe_unused]] const int64_t ldb,
                                const int64_t ldc,
                                [[maybe_unused]] const int32_t block_size,
                                [[maybe_unused]] const int32_t dynamic_k_size,
                                const bool accum_c) {
    static_assert(BlockTokens % OUTPUT_COLS_PER_BLOCK == 0);
    // BFMMLA kernels require a compile-time head_dim; head_dim_ct is kept only
    // for API parity with the other tile_gemm implementations.
    if constexpr (head_dim_ct >= 0) {
      static_assert(head_dim_ct == HeadDim,
                    "BFMMLA expects head_dim_ct to match HeadDim; PV passes "
                    "-1 for API parity.");
    }

    if constexpr (phase == AttentionGemmPhase::QK) {
      const int64_t b_reduction_stride = K_INNER_STRIDE;
      const int64_t b_token_block_stride = (HeadDim / TILE_K) * K_INNER_STRIDE;

      gemm_macro_neon_bfmmla<AttentionGemmPhase::QK, BlockTokens, HeadDim>(
          reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
          m_size, 0, lda, ldc, b_token_block_stride, b_reduction_stride,
          accum_c);
    } else {
      const int64_t b_pair_stride =
          (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;

      // PV GEMM: map the runtime dynamic_k_size onto compile-time K
      // specializations for common sizes (32/128/256), falling back to the
      // generic runtime-K kernel otherwise.
      switch (dynamic_k_size) {
        case 32:
          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 32>(
              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
              m_size, 32, lda, ldc, b_pair_stride, 0, accum_c);
          break;
        case 128:
          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 128>(
              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
              m_size, 128, lda, ldc, b_pair_stride, 0, accum_c);
          break;
        case 256:
          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 256>(
              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
              m_size, 256, lda, ldc, b_pair_stride, 0, accum_c);
          break;
        default:
          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim>(
              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
              m_size, dynamic_k_size, lda, ldc, b_pair_stride, 0, accum_c);
          break;
      }
    }
  }
};
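// Illustrative usage sketch (hypothetical values, added for exposition): with
// BlockTokens = 32 and HeadDim = 128, a QK tile for one KV block could be
// issued roughly as
//
//   using TileGemm =
//       TileGemmNEONBFMMLA<c10::BFloat16, /*BlockTokens=*/32, /*HeadDim=*/128>;
//   TileGemm::gemm<AttentionGemmPhase::QK, /*head_dim_ct=*/128>(
//       m_size, q_tile, k_cache_block, logits_tile, /*lda=*/128, /*ldb=*/0,
//       /*ldc=*/32, /*block_size=*/32, /*dynamic_k_size=*/0,
//       /*accum_c=*/false);
//
// The attention driver below supplies the real tile pointers and strides; the
// numbers here are placeholders only.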

// Shared ASIMD BFMMLA implementation (BF16 only). The block size alignment and
// ISA tag are template parameters so we can reuse the same kernels for
// different NEON configurations.
template <int64_t block_size_alignment, ISA isa_type, int64_t head_dim>
class AttentionImplNEONBFMMLA {
 public:
  using query_t = c10::BFloat16;
  using q_buffer_t = c10::BFloat16;
  using kv_cache_t = c10::BFloat16;
  using logits_buffer_t = float;
  using partial_output_buffer_t = float;
  using prob_buffer_t = c10::BFloat16;

  static constexpr int64_t BlockSizeAlignment = block_size_alignment;
  // HeadDimAlignment equals head_dim so that the PV phase processes
  // the full head dimension in a single gemm call.
  static constexpr int64_t HeadDimAlignment = head_dim;
  static constexpr int64_t MaxQHeadNumPerIteration = 16;
  static constexpr int64_t HeadDim = head_dim;
  static constexpr ISA ISAType = isa_type;
  static constexpr bool scale_on_logits = false;

  static_assert(HeadDim % OUTPUT_COLS_PER_BLOCK == 0);
  static_assert(BlockSizeAlignment % OUTPUT_COLS_PER_BLOCK == 0);
  static_assert(HeadDim % TILE_K == 0, "HeadDim must be a multiple of TILE_K");
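  // Note on the type choices above (exposition only): every operand the BFMMLA
  // GEMMs consume (query buffer, KV cache, softmax probabilities) stays in
  // BF16, while logits and partial outputs accumulate in FP32.
  // scale_on_logits == false indicates the scale factor is applied when Q is
  // copied (see copy_q_heads_tile below) rather than on the QK logits.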

 public:
  template <template <typename tile_gemm_t> typename attention>
  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
    attention<
        TileGemmNEONBFMMLA<kv_cache_t, static_cast<int32_t>(BlockSizeAlignment),
                           static_cast<int32_t>(HeadDim)>>
        attention_iteration;
    attention_iteration(CPU_ATTENTION_PARAMS);
  }

  // Key cache stride per token group (TokenColumn layout; QK)
  static constexpr int64_t k_cache_token_group_stride(
      [[maybe_unused]] const int32_t block_size) {
    static_assert(BlockSizeAlignment % K_TOKENS_PER_GROUP == 0);
    return (BlockSizeAlignment / K_TOKENS_PER_GROUP) *
           ((head_dim / TILE_K) * K_INNER_STRIDE);
  }

  // Value cache stride per token group (TokenRow layout; PV)
  static constexpr int64_t v_cache_token_group_stride(
      [[maybe_unused]] const int32_t block_size) {
    static_assert(BlockSizeAlignment % V_TOKENS_PER_ROW_BLOCK == 0);
    return (BlockSizeAlignment / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
  }
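  // Worked example (placeholder constants, for illustration only): if
  // BlockSizeAlignment = 32, head_dim = 128, K_TOKENS_PER_GROUP = 8,
  // TILE_K = 4 and K_INNER_STRIDE = 32, then
  //   k_cache_token_group_stride(...) = (32 / 8) * ((128 / 4) * 32) = 4096
  // BF16 elements per token group; with V_TOKENS_PER_ROW_BLOCK = 4 and
  // V_INNER_STRIDE = 16,
  //   v_cache_token_group_stride(...) = (32 / 4) * 16 = 128.
  // The real values come from the TILE_*/K_*/V_* constants defined earlier in
  // this header.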

  // Stride to advance to the "next" head_dim group in the V cache. Because
  // HeadDimAlignment == head_dim there is only a single head_dim group, so the
  // PV phase never advances by this stride; it is defined as the full per-head
  // V cache size for completeness.
  static constexpr int64_t v_cache_head_group_stride(
      [[maybe_unused]] const int32_t block_size) {
    return head_dim * block_size;
  }

  // Copy Q heads into the contiguous query buffer and apply the scale factor
  // using native BF16 intrinsics (widen to FP32, multiply, narrow back to BF16)
  static void copy_q_heads_tile(c10::BFloat16* __restrict__ src,
                                c10::BFloat16* __restrict__ q_buffer,
                                const int32_t q_num,
                                const int32_t q_heads_per_kv,
                                const int64_t q_num_stride,
                                const int64_t q_head_stride, float scale) {
    constexpr int32_t dim = static_cast<int32_t>(head_dim);
    const float32x4_t scale_vec = vdupq_n_f32(scale);

    for (int32_t qi = 0; qi < q_num; ++qi) {
      for (int32_t hi = 0; hi < q_heads_per_kv; ++hi) {
        c10::BFloat16* __restrict__ curr_q =
            src + qi * q_num_stride + hi * q_head_stride;
        c10::BFloat16* __restrict__ dst =
            q_buffer + qi * q_heads_per_kv * head_dim + hi * head_dim;

        for (int32_t i = 0; i < dim; i += OUTPUT_COLS_PER_BLOCK) {
          bfloat16x8_t in8 =
              vld1q_bf16(reinterpret_cast<const bfloat16_t*>(curr_q + i));
          float32x4_t lo = vmulq_f32(vcvtq_low_f32_bf16(in8), scale_vec);
          float32x4_t hi = vmulq_f32(vcvtq_high_f32_bf16(in8), scale_vec);

          bfloat16x4_t lo_b = vcvt_bf16_f32(lo);
          bfloat16x4_t hi_b = vcvt_bf16_f32(hi);
          bfloat16x8_t out = vcombine_bf16(lo_b, hi_b);
          vst1q_bf16(reinterpret_cast<bfloat16_t*>(dst + i), out);
        }
      }
    }
  }
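  // For reference (exposition only), the vector loop above is element-wise
  // equivalent to the scalar form
  //
  //   for (int32_t i = 0; i < dim; ++i) {
  //     dst[i] = c10::BFloat16(static_cast<float>(curr_q[i]) * scale);
  //   }
  //
  // with the BF16 -> FP32 widening, multiply, and FP32 -> BF16 narrowing done
  // eight lanes at a time.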

 public:
  // Reshape and cache K/V into BFMMLA-optimized layouts
  // K cache:
  //   [block_size/K_TOKENS_PER_GROUP][head_dim/TILE_K][K_INNER_STRIDE]
  //   - TokenColumn
  // V cache:
  //   [head_dim/TILE_COLS][block_size/V_TOKENS_PER_ROW_BLOCK][V_INNER_STRIDE]
  //   - TokenRows
  static void reshape_and_cache(
      const c10::BFloat16* __restrict__ key,
      const c10::BFloat16* __restrict__ value,
      c10::BFloat16* __restrict__ key_cache,
      c10::BFloat16* __restrict__ value_cache,
      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
      const int64_t head_num, const int64_t key_head_num_stride,
      const int64_t value_head_num_stride,
      [[maybe_unused]] const int64_t num_blocks,
      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
      const int64_t block_size,
      [[maybe_unused]] const int64_t block_size_stride) {
    const int64_t k_block_stride = (head_dim / TILE_K) * K_INNER_STRIDE;
    const int64_t v_pair_stride =
        (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;

#pragma omp parallel for
    for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
      for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
        const int64_t pos = slot_mapping[token_idx];
        if (pos < 0) continue;

        const int64_t block_idx = pos / block_size;
        const int64_t block_offset = pos % block_size;

        // Key cache: TokenColumn QK
        {
          const c10::BFloat16* __restrict key_src =
              key + token_idx * key_token_num_stride +
              head_idx * key_head_num_stride;

          c10::BFloat16* __restrict key_base = key_cache +
                                               block_idx * num_blocks_stride +
                                               head_idx * cache_head_num_stride;

          const int64_t block_in_block = block_offset / K_TOKENS_PER_GROUP;
          const int64_t pair_in_block =
              (block_offset % K_TOKENS_PER_GROUP) / TILE_COLS;
          const int64_t lane_base = (block_offset & 1) ? TILE_K : 0;

          c10::BFloat16* __restrict block_base =
              key_base + block_in_block * k_block_stride;

          for (int64_t hd4 = 0; hd4 < head_dim / TILE_K; ++hd4) {
            uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(
                block_base + hd4 * K_INNER_STRIDE +
                pair_in_block * V_INNER_STRIDE + lane_base);
            const uint16_t* src_u16 =
                reinterpret_cast<const uint16_t*>(key_src + hd4 * TILE_K);
            vst1_u16(dst_u16, vld1_u16(src_u16));
          }
        }

        // Value cache: TokenRow PV
        {
          const c10::BFloat16* __restrict value_src =
              value + token_idx * value_token_num_stride +
              head_idx * value_head_num_stride;

          c10::BFloat16* __restrict value_base =
              value_cache + block_idx * num_blocks_stride +
              head_idx * cache_head_num_stride;

          const int64_t row_block = block_offset / V_TOKENS_PER_ROW_BLOCK;
          const int64_t lane = block_offset & (V_TOKENS_PER_ROW_BLOCK - 1);

          c10::BFloat16* __restrict row_block_base =
              value_base + row_block * V_INNER_STRIDE;

          for (int64_t hd2 = 0; hd2 < head_dim / TILE_COLS; ++hd2) {
            c10::BFloat16* __restrict dst_val =
                row_block_base + hd2 * v_pair_stride;

            const uint16_t* src_u16 =
                reinterpret_cast<const uint16_t*>(value_src);
            uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(dst_val);
            dst_u16[lane] = src_u16[hd2 * TILE_COLS + 0];
            dst_u16[lane + V_TOKENS_PER_ROW_BLOCK] =
                src_u16[hd2 * TILE_COLS + 1];
          }
        }
      }
    }
  }
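  // Worked placement example (placeholder constants, exposition only): with
  // K_TOKENS_PER_GROUP = 8, TILE_COLS = 2, TILE_K = 4 and
  // V_TOKENS_PER_ROW_BLOCK = 4, a token at block_offset = 13 lands at
  //   K cache: block_in_block = 13 / 8 = 1, pair_in_block = (13 % 8) / 2 = 2,
  //            lane_base = TILE_K = 4 (odd offset),
  //   V cache: row_block = 13 / 4 = 3, lane = 13 & 3 = 1.
  // The actual constants are the TILE_*/K_*/V_* values defined earlier in this
  // header; the arithmetic above only illustrates the index decomposition used
  // by reshape_and_cache.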
};

}  // namespace cpu_attention

#endif  // CPU_ATTN_ASIMD_BFMMLA_HPP
Some files were not shown because too many files have changed in this diff.