Compare commits
811 Commits
v0.14.0rc2
...
v0.16.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
89a77b1084 | ||
|
|
d3c1513f5f | ||
|
|
5dbfbc967b | ||
|
|
c86cdcbcd2 | ||
|
|
3c9496f146 | ||
|
|
2d5be1dd5c | ||
|
|
7a06e5b05b | ||
|
|
946b2f106c | ||
|
|
5e8adb0c49 | ||
|
|
9be1ff2d3a | ||
|
|
b3ee90f961 | ||
|
|
c44d0c6d66 | ||
|
|
83db96d8cd | ||
|
|
dbfb79fe45 | ||
|
|
b2e1fc3589 | ||
|
|
55a1baebc5 | ||
|
|
e1e9841631 | ||
|
|
5bd63387c3 | ||
|
|
22b64948f6 | ||
|
|
7c233dbb36 | ||
|
|
a75a5b54c7 | ||
|
|
f97ca67176 | ||
|
|
084aa19f02 | ||
|
|
1ecfabe525 | ||
|
|
4df841fe75 | ||
|
|
a263aa6140 | ||
|
|
179ae7da8f | ||
|
|
c4df59ad43 | ||
|
|
785cf28fff | ||
|
|
a96197f564 | ||
|
|
ab10d79855 | ||
|
|
7fcb705b80 | ||
|
|
b956cdf818 | ||
|
|
ed17f54c8b | ||
|
|
860981d8d8 | ||
|
|
52181baaea | ||
|
|
de3869bb4d | ||
|
|
ce9b3cd3e9 | ||
|
|
db4ede9743 | ||
|
|
2cb2340f7a | ||
|
|
4df44c16ba | ||
|
|
81fe69cae5 | ||
|
|
dd6a6e1190 | ||
|
|
edb359cce4 | ||
|
|
6ed5eda300 | ||
|
|
11a4c9d30d | ||
|
|
15a0b9e570 | ||
|
|
c490d8cc73 | ||
|
|
48312e579a | ||
|
|
bc32444b23 | ||
|
|
18e8545297 | ||
|
|
6f7adc533a | ||
|
|
40218a82ba | ||
|
|
1c3b22058f | ||
|
|
3920cafdd6 | ||
|
|
ec28784fdc | ||
|
|
55aeec04f5 | ||
|
|
906077181b | ||
|
|
89a385d79f | ||
|
|
4a2d00eafd | ||
|
|
207c3a0c20 | ||
|
|
ae2e93f89b | ||
|
|
9e9acce577 | ||
|
|
fe5438200b | ||
|
|
77c09e1130 | ||
|
|
16786da735 | ||
|
|
aaa2efbe98 | ||
|
|
aca5967416 | ||
|
|
67a746e87f | ||
|
|
7bec435130 | ||
|
|
5c52644b10 | ||
|
|
2ce9fe4ad0 | ||
|
|
cd8b405bd0 | ||
|
|
4707f7ebb4 | ||
|
|
c39ee9ee2b | ||
|
|
350ca72c04 | ||
|
|
1fb0495a72 | ||
|
|
85ee1d962b | ||
|
|
51a7bda625 | ||
|
|
6e7b1c4b59 | ||
|
|
2991dd3d22 | ||
|
|
ac32e66cf9 | ||
|
|
f79d9dce16 | ||
|
|
ba5cbbf107 | ||
|
|
233b26ab35 | ||
|
|
791a94bed0 | ||
|
|
e969a169ef | ||
|
|
6d8d34be6d | ||
|
|
1363e3d6d5 | ||
|
|
965525667b | ||
|
|
6550815c3a | ||
|
|
7439e4f41b | ||
|
|
ac04dd374f | ||
|
|
035a6cb09a | ||
|
|
a32cb49b60 | ||
|
|
20d7454c9b | ||
|
|
5819ca8944 | ||
|
|
79028d4388 | ||
|
|
325ab6b0a8 | ||
|
|
91a07ff618 | ||
|
|
d5c4800112 | ||
|
|
42d5d705f9 | ||
|
|
116880a5a0 | ||
|
|
4145e50d85 | ||
|
|
20f5d185a6 | ||
|
|
1887acca9e | ||
|
|
92e7562a99 | ||
|
|
87d0d17ab5 | ||
|
|
a57c8228ff | ||
|
|
1ee95841bd | ||
|
|
7d8c6804e2 | ||
|
|
af3162d3aa | ||
|
|
5b2a9422f0 | ||
|
|
c1858b7ec8 | ||
|
|
82914d2ae8 | ||
|
|
81a90e5277 | ||
|
|
1c3a221d3b | ||
|
|
7bd42e609d | ||
|
|
a2522839d8 | ||
|
|
59a5cb387a | ||
|
|
8322d4e47f | ||
|
|
3e472e81f9 | ||
|
|
038914b7c8 | ||
|
|
d2f4a71cd5 | ||
|
|
2abd97592f | ||
|
|
6abb0454ad | ||
|
|
db6f71d4c9 | ||
|
|
fd03538bf9 | ||
|
|
1f70313e59 | ||
|
|
07daee132b | ||
|
|
9595afda18 | ||
|
|
c1395f72cd | ||
|
|
007b183d74 | ||
|
|
add9f1fbd9 | ||
|
|
e3bf79ffa0 | ||
|
|
fb1270f1f8 | ||
|
|
72bb24e2db | ||
|
|
a7be77beef | ||
|
|
bbe0574d8e | ||
|
|
4d9513537d | ||
|
|
439afa4eea | ||
|
|
fa4e0fb028 | ||
|
|
ce498a6d61 | ||
|
|
9f14c9224d | ||
|
|
535de06cb1 | ||
|
|
4292c90a2a | ||
|
|
6e98f6d8b6 | ||
|
|
2f6d17cb2f | ||
|
|
192ad4648b | ||
|
|
0e92298622 | ||
|
|
87d9a26166 | ||
|
|
80f921ba4b | ||
|
|
711edaf0d0 | ||
|
|
1d367a738e | ||
|
|
32a02c7ca2 | ||
|
|
f67ee8b859 | ||
|
|
e57ef99b40 | ||
|
|
f8516a1ab9 | ||
|
|
824058076c | ||
|
|
8e32690869 | ||
|
|
a208439537 | ||
|
|
bcd2f74c0d | ||
|
|
f79f777803 | ||
|
|
4c8d1bf361 | ||
|
|
061da6bcf7 | ||
|
|
4403e3ed4c | ||
|
|
08e094997e | ||
|
|
d88a1df699 | ||
|
|
90d74ebaa4 | ||
|
|
45f8fd6f97 | ||
|
|
5e1e0a0fbd | ||
|
|
eb5ed20743 | ||
|
|
2647163674 | ||
|
|
9fb27dd3b3 | ||
|
|
4dffc5e044 | ||
|
|
e1bf04b6c2 | ||
|
|
02080179a3 | ||
|
|
1b8fe6f7c4 | ||
|
|
52ee21021a | ||
|
|
655efb3e69 | ||
|
|
bd8da29a66 | ||
|
|
2a99c5a6c8 | ||
|
|
3f7662d650 | ||
|
|
a372f3f40a | ||
|
|
61e632aea1 | ||
|
|
b1bb18de8d | ||
|
|
2267cb1cfd | ||
|
|
0d6ccf68fa | ||
|
|
18e7cbbb15 | ||
|
|
f0d5251715 | ||
|
|
5c4f2dd6ef | ||
|
|
f3d8a34671 | ||
|
|
4bc913aeec | ||
|
|
fbb3cf6981 | ||
|
|
2df2b3499d | ||
|
|
2a8d84e66d | ||
|
|
a3acfa1071 | ||
|
|
be8168ff88 | ||
|
|
f6af34626d | ||
|
|
ceab70c89d | ||
|
|
52683ccbe1 | ||
|
|
e346e2d056 | ||
|
|
83449a5ff0 | ||
|
|
dad2d6a590 | ||
|
|
32e84fa1ff | ||
|
|
fd9c83d0e0 | ||
|
|
b95cc5014d | ||
|
|
61397891ce | ||
|
|
ef248ff740 | ||
|
|
e10604480b | ||
|
|
bf001da4bf | ||
|
|
a0a984ac2e | ||
|
|
f1cb9b5544 | ||
|
|
4c4b6f7a97 | ||
|
|
10546f925a | ||
|
|
e69c990c21 | ||
|
|
5eac9a1b34 | ||
|
|
1b60b45d0d | ||
|
|
4b3803d180 | ||
|
|
5019c59dd2 | ||
|
|
089cd4f002 | ||
|
|
0130223bd9 | ||
|
|
5d1aef3004 | ||
|
|
ffe1fc7a28 | ||
|
|
8b7346d5f1 | ||
|
|
6141ebe0dd | ||
|
|
199e3cb476 | ||
|
|
9f8cb81b44 | ||
|
|
d7e17aaacd | ||
|
|
528e9b1490 | ||
|
|
d95b4be47a | ||
|
|
4061dcf4c5 | ||
|
|
0aca8b8c62 | ||
|
|
9eb58f8cf1 | ||
|
|
b10d05b8a8 | ||
|
|
b398e5c819 | ||
|
|
78061ef584 | ||
|
|
528b3076af | ||
|
|
a502831d36 | ||
|
|
ba871fb788 | ||
|
|
ab374786c7 | ||
|
|
808dd87b30 | ||
|
|
beb8899482 | ||
|
|
ce88756b96 | ||
|
|
a3154a6092 | ||
|
|
7c036432fc | ||
|
|
318b120766 | ||
|
|
c3b40dc3e7 | ||
|
|
a01ef3fa51 | ||
|
|
7320ca3942 | ||
|
|
cf0a99f84d | ||
|
|
e535d90deb | ||
|
|
0b225fb7b2 | ||
|
|
46b4a02794 | ||
|
|
8869cd8ec1 | ||
|
|
cd86fff38f | ||
|
|
b5f8c3092d | ||
|
|
21997f45b1 | ||
|
|
672023877b | ||
|
|
754a8ca942 | ||
|
|
302ecf64ff | ||
|
|
b6bb2842cf | ||
|
|
79b6ec6aab | ||
|
|
d6416fdde9 | ||
|
|
0fb3157267 | ||
|
|
a358e4dffe | ||
|
|
079781177a | ||
|
|
63c0889416 | ||
|
|
1e86c802d4 | ||
|
|
fedf64332e | ||
|
|
2238a12c13 | ||
|
|
ce0afe2451 | ||
|
|
88c3e114d8 | ||
|
|
92924b2ddd | ||
|
|
27cb2f678f | ||
|
|
22d9a056d5 | ||
|
|
13b842f271 | ||
|
|
15f40b20aa | ||
|
|
793af538a3 | ||
|
|
6f5e7cda57 | ||
|
|
68feb76a6f | ||
|
|
4cb59dea6a | ||
|
|
608b556507 | ||
|
|
f0a1c8453a | ||
|
|
8980001c93 | ||
|
|
527bcd14d4 | ||
|
|
f68e3ea4e1 | ||
|
|
d5c41db35b | ||
|
|
1618e25492 | ||
|
|
f3888aca83 | ||
|
|
f0bca83ee4 | ||
|
|
73419abfae | ||
|
|
e77f162cf5 | ||
|
|
8ecd213c0b | ||
|
|
5b55c0bea7 | ||
|
|
15e0bb9c42 | ||
|
|
6c64c41b4a | ||
|
|
a2ef06e1b3 | ||
|
|
0a3c71e7e5 | ||
|
|
29fba76781 | ||
|
|
9df152bbf6 | ||
|
|
876a16f4fb | ||
|
|
aaa901ad55 | ||
|
|
010ec0c30e | ||
|
|
64a40a7ab4 | ||
|
|
31aedfe7d6 | ||
|
|
67ebaff528 | ||
|
|
2b465570e6 | ||
|
|
9ca66ecc10 | ||
|
|
c3a9752b0c | ||
|
|
f451b4558b | ||
|
|
3f96fcf646 | ||
|
|
6c1f9e4c18 | ||
|
|
67239c4c42 | ||
|
|
8ece60768f | ||
|
|
fd0e377244 | ||
|
|
f857a03f6b | ||
|
|
74898a7015 | ||
|
|
8f5d51203b | ||
|
|
ae5b7aff2b | ||
|
|
a11bc12d53 | ||
|
|
58cb55e4de | ||
|
|
cf896ae0e3 | ||
|
|
c5113f60f2 | ||
|
|
174f16700b | ||
|
|
8e2ad97ad0 | ||
|
|
10152d2194 | ||
|
|
1a7894dbdf | ||
|
|
c87eac18f7 | ||
|
|
f45870b53f | ||
|
|
ba45bedfd1 | ||
|
|
9432ed8c7e | ||
|
|
726d89720c | ||
|
|
d334dd26c4 | ||
|
|
070c811d6f | ||
|
|
8bfc8d5600 | ||
|
|
ec51831a22 | ||
|
|
80b918f2bd | ||
|
|
c46b0cd0af | ||
|
|
133765760b | ||
|
|
bfb9bdaf3f | ||
|
|
2284461d02 | ||
|
|
8e2a469b3b | ||
|
|
23591e631e | ||
|
|
0493d897c4 | ||
|
|
8c8ebeb941 | ||
|
|
831453fcef | ||
|
|
5a66c9cc76 | ||
|
|
5e73e4900c | ||
|
|
c6e7404cc5 | ||
|
|
17b17c0684 | ||
|
|
8bb6271c77 | ||
|
|
8b3f0a99dd | ||
|
|
8311f083bd | ||
|
|
40c35038d2 | ||
|
|
a5aa4d5c0f | ||
|
|
615e8033e5 | ||
|
|
d09135fbd0 | ||
|
|
8688c3d460 | ||
|
|
5400014d55 | ||
|
|
3a92c6f3b5 | ||
|
|
e01ff5c070 | ||
|
|
fb946a7f89 | ||
|
|
a650ad1588 | ||
|
|
d697581a7c | ||
|
|
5eeba80c74 | ||
|
|
08b1195e62 | ||
|
|
3bba2edb0f | ||
|
|
53fc166402 | ||
|
|
31b25f6516 | ||
|
|
abb34ac43a | ||
|
|
2515bbd027 | ||
|
|
c487a8eef4 | ||
|
|
9e138cb01d | ||
|
|
f9d03599ef | ||
|
|
39037d258e | ||
|
|
51550179fc | ||
|
|
07ea184f00 | ||
|
|
a663b218ae | ||
|
|
1bd47d6e5a | ||
|
|
141cd43967 | ||
|
|
6bf3b46d78 | ||
|
|
77c4f45c6c | ||
|
|
ca1969186d | ||
|
|
ab597c869a | ||
|
|
4197168ea5 | ||
|
|
59bcc5b6f2 | ||
|
|
3e440786af | ||
|
|
8bdd3979d8 | ||
|
|
c4e744dbd4 | ||
|
|
8ebf372e9d | ||
|
|
f210f0b7b1 | ||
|
|
392c5af4fe | ||
|
|
af9b69f977 | ||
|
|
8e5e40daf4 | ||
|
|
2e8de86777 | ||
|
|
247d1a32ea | ||
|
|
ecb4f82209 | ||
|
|
5914090765 | ||
|
|
f1acbd68c5 | ||
|
|
9581185d51 | ||
|
|
2dd359f953 | ||
|
|
22ad649501 | ||
|
|
36d450e3b8 | ||
|
|
a2b877df6c | ||
|
|
35fb0b8613 | ||
|
|
2eb673a088 | ||
|
|
a97b5e206d | ||
|
|
911b51b69f | ||
|
|
604e3b87e8 | ||
|
|
706f123b23 | ||
|
|
fb7abfc1d0 | ||
|
|
5d3d6e44e8 | ||
|
|
46ec6d71c7 | ||
|
|
e82fa448c4 | ||
|
|
d9aa39a3bb | ||
|
|
3a6d5cbefd | ||
|
|
f5d7049cc1 | ||
|
|
3c3c547ce0 | ||
|
|
1cbccb6dba | ||
|
|
bd92089d33 | ||
|
|
a6760f1525 | ||
|
|
66e601ef79 | ||
|
|
0cd259b2d8 | ||
|
|
83fb2d09e8 | ||
|
|
f3a5ee705f | ||
|
|
7cbbca9aaa | ||
|
|
5ec44056f7 | ||
|
|
492a7983dd | ||
|
|
a608b4c6c2 | ||
|
|
1f3a2c2944 | ||
|
|
7227d06156 | ||
|
|
14385c80fc | ||
|
|
76139d0801 | ||
|
|
da8d0c441a | ||
|
|
58996f3589 | ||
|
|
b539f988e1 | ||
|
|
6c00645712 | ||
|
|
b781eeaa15 | ||
|
|
e0b005d9cf | ||
|
|
3b8f0fe59e | ||
|
|
c831911be2 | ||
|
|
157caf511b | ||
|
|
0b53bec60b | ||
|
|
c568581ff3 | ||
|
|
2d7053438a | ||
|
|
5a93b9162b | ||
|
|
6d86fde09c | ||
|
|
510ed1e8d3 | ||
|
|
8caffd92df | ||
|
|
58a05b0ca1 | ||
|
|
6ee7f18f33 | ||
|
|
8f987883cb | ||
|
|
ebe0ba91db | ||
|
|
43a013c3a2 | ||
|
|
c25dbee40d | ||
|
|
19ab0f7ce5 | ||
|
|
67fe677c53 | ||
|
|
d56afd45fd | ||
|
|
a2393ed496 | ||
|
|
be6931ee27 | ||
|
|
9ef3b718d9 | ||
|
|
bb17e8f11c | ||
|
|
dcd80206b7 | ||
|
|
f4a0921c9c | ||
|
|
208c56256f | ||
|
|
9ac818a551 | ||
|
|
6ca2c91b96 | ||
|
|
e33192b269 | ||
|
|
61274bdef5 | ||
|
|
b40db4dfec | ||
|
|
11b556878b | ||
|
|
ee484b3f4b | ||
|
|
a9b53dd435 | ||
|
|
254db42ede | ||
|
|
105d104576 | ||
|
|
566cdb6cfb | ||
|
|
2f0d3ba745 | ||
|
|
edf927bc9f | ||
|
|
22aeb43007 | ||
|
|
a698e8e7ad | ||
|
|
151e5451c2 | ||
|
|
73b243463b | ||
|
|
7e67df5570 | ||
|
|
ff6c1da4e6 | ||
|
|
fcb9df99bd | ||
|
|
1ebdff412a | ||
|
|
91601ff478 | ||
|
|
d4dbb7af63 | ||
|
|
203d0bc0c2 | ||
|
|
17ab54de81 | ||
|
|
cd775bdbe0 | ||
|
|
da5e7b12be | ||
|
|
719ac592ed | ||
|
|
1209b784f2 | ||
|
|
5fa0f6efa9 | ||
|
|
bc0d291bfe | ||
|
|
9ad7f89f55 | ||
|
|
6450b536a6 | ||
|
|
0f19427db5 | ||
|
|
51931c5c9a | ||
|
|
06b557ecd9 | ||
|
|
81c2a889ce | ||
|
|
8edaf38570 | ||
|
|
5c86a89805 | ||
|
|
0ccecf8833 | ||
|
|
0b9a735e11 | ||
|
|
14d03b8ddb | ||
|
|
d0cbac5827 | ||
|
|
c0d820457a | ||
|
|
97ef11dd34 | ||
|
|
ecc3dd66cc | ||
|
|
7e1f10d562 | ||
|
|
a28b94e6ef | ||
|
|
0118cdcc02 | ||
|
|
136c499f6e | ||
|
|
ebd0a17e0e | ||
|
|
37c9859fab | ||
|
|
4561f13985 | ||
|
|
6cc6d92be5 | ||
|
|
dfab5f3764 | ||
|
|
586a57ad7e | ||
|
|
3a41459501 | ||
|
|
8518b30447 | ||
|
|
2d6b537157 | ||
|
|
68b0a6c1ba | ||
|
|
5206e5e28c | ||
|
|
fec9da0af4 | ||
|
|
bbbd696af9 | ||
|
|
9b77bb790d | ||
|
|
305e53ade8 | ||
|
|
1cb4341fbc | ||
|
|
1fb648bf10 | ||
|
|
7e22309755 | ||
|
|
90c2007932 | ||
|
|
d95d650762 | ||
|
|
13d8746c54 | ||
|
|
10e94c84f6 | ||
|
|
243e78c20f | ||
|
|
aac0b817fa | ||
|
|
05f3d714db | ||
|
|
3f3f89529d | ||
|
|
5da4c7d789 | ||
|
|
160c6fa387 | ||
|
|
a8eb1182f1 | ||
|
|
fa6e599a61 | ||
|
|
7ef5873752 | ||
|
|
5e4e0e51f4 | ||
|
|
f61c9da711 | ||
|
|
7fe255889e | ||
|
|
dc917cceb8 | ||
|
|
fc56f4a071 | ||
|
|
d08b356ee0 | ||
|
|
f744810184 | ||
|
|
44f08af3a7 | ||
|
|
955b43a5a5 | ||
|
|
744ef30484 | ||
|
|
300622e609 | ||
|
|
69d09fdd6c | ||
|
|
3a63be0faa | ||
|
|
803e3f3f68 | ||
|
|
70917b1c55 | ||
|
|
c517d8c934 | ||
|
|
fc37187a51 | ||
|
|
ff365eea94 | ||
|
|
444e2e7e1f | ||
|
|
bc14663e6a | ||
|
|
654a71fc3c | ||
|
|
15e302dfce | ||
|
|
d117a4d1a9 | ||
|
|
421012b63a | ||
|
|
841d53aaa8 | ||
|
|
1752262e96 | ||
|
|
ea6102b85d | ||
|
|
328cbb2773 | ||
|
|
64e3d67ac0 | ||
|
|
098b2d66fe | ||
|
|
8ebf271bb6 | ||
|
|
49a1262267 | ||
|
|
2b8a38b6d6 | ||
|
|
1bf1a34b19 | ||
|
|
a810299838 | ||
|
|
eb1629da24 | ||
|
|
019e2c3b7c | ||
|
|
f5fdec8ce2 | ||
|
|
1579c9b5fd | ||
|
|
889722f3bf | ||
|
|
49d9653852 | ||
|
|
a1d82466ea | ||
|
|
24a163ed77 | ||
|
|
378385b90c | ||
|
|
c5487e2b96 | ||
|
|
6437ff1fb9 | ||
|
|
5e00b561cd | ||
|
|
408195ec59 | ||
|
|
63227accf5 | ||
|
|
e675dda67b | ||
|
|
24dc30f7ff | ||
|
|
180fba653e | ||
|
|
f999539869 | ||
|
|
e1da249c93 | ||
|
|
9b693d023c | ||
|
|
808d6fd7b9 | ||
|
|
1861ae8aae | ||
|
|
4e31b7f228 | ||
|
|
6c20e89c02 | ||
|
|
85f55c943c | ||
|
|
cea3c754c4 | ||
|
|
42135d6898 | ||
|
|
e14467be43 | ||
|
|
7727ce35c2 | ||
|
|
6bb2bc71e2 | ||
|
|
c80f92c14d | ||
|
|
f23fb5a7c1 | ||
|
|
360aa93f8f | ||
|
|
27ca95b3c9 | ||
|
|
b4f64e5b02 | ||
|
|
7ab80a8e37 | ||
|
|
0900cedb3f | ||
|
|
6f067b1fb7 | ||
|
|
27b81e010d | ||
|
|
7013e9ac8f | ||
|
|
c78ee240b3 | ||
|
|
d2389c1262 | ||
|
|
22375f8d13 | ||
|
|
9b67338b78 | ||
|
|
2261340806 | ||
|
|
86c69dc54c | ||
|
|
7c5dedc247 | ||
|
|
193069d129 | ||
|
|
f0feb1cf81 | ||
|
|
09194b90a5 | ||
|
|
9ab4388cd3 | ||
|
|
04a9e064db | ||
|
|
c025263ddd | ||
|
|
6c97b9b9b6 | ||
|
|
4ca62a0dbd | ||
|
|
7901109ea5 | ||
|
|
13f6630a9e | ||
|
|
fda3f03eb2 | ||
|
|
bb9172030e | ||
|
|
c4e5bdf61b | ||
|
|
7f1bcd18ff | ||
|
|
8be263c3fb | ||
|
|
e1a34c3a5d | ||
|
|
148117ea2e | ||
|
|
e9c83cdc51 | ||
|
|
b75e85dede | ||
|
|
4753f3bf69 | ||
|
|
6c01ffb897 | ||
|
|
7b7cdce968 | ||
|
|
12dab78f49 | ||
|
|
05dc4bfab6 | ||
|
|
1a1fc3bbc0 | ||
|
|
43fada5360 | ||
|
|
4a5299c93f | ||
|
|
73f2a81c75 | ||
|
|
7350331718 | ||
|
|
9d1e611f0e | ||
|
|
0727cc9ecf | ||
|
|
a0490be8f1 | ||
|
|
cd3ac5b797 | ||
|
|
2636d76257 | ||
|
|
aa7f37ccfa | ||
|
|
c88860d759 | ||
|
|
758df5afe7 | ||
|
|
cdd03d25d3 | ||
|
|
74c583bc50 | ||
|
|
c0a350ca73 | ||
|
|
71832ba71e | ||
|
|
11bbf86f6a | ||
|
|
3c8740aacb | ||
|
|
7518a3dc65 | ||
|
|
976af2f314 | ||
|
|
9a1f16da1e | ||
|
|
bb1848cd62 | ||
|
|
6101a26dc9 | ||
|
|
f5d1740030 | ||
|
|
eebc58df0c | ||
|
|
16de822c71 | ||
|
|
5480c6b1fa | ||
|
|
ba29ab441e | ||
|
|
afc3622602 | ||
|
|
327a02d8db | ||
|
|
2f03035a61 | ||
|
|
38bf2ffb21 | ||
|
|
c826c72a96 | ||
|
|
fe36bf5e80 | ||
|
|
963dc0b865 | ||
|
|
8cc26acd8b | ||
|
|
4a6af8813f | ||
|
|
4147910f1e | ||
|
|
3055232ba0 | ||
|
|
965765aef9 | ||
|
|
9e078d0582 | ||
|
|
2b99f210f5 | ||
|
|
1646fea672 | ||
|
|
d3317bbba4 | ||
|
|
8e61425ee6 | ||
|
|
2e7c89e708 | ||
|
|
037a6487af | ||
|
|
5a3050a089 | ||
|
|
484e22bc18 | ||
|
|
ca21288080 | ||
|
|
4c82b6fac7 | ||
|
|
a884bc62d6 | ||
|
|
7a1030431a | ||
|
|
9fd918e510 | ||
|
|
c9a533079c | ||
|
|
6ca4f400d8 | ||
|
|
180e981d56 | ||
|
|
b84c426a8c | ||
|
|
b66b0d6abb | ||
|
|
03da3b52ef | ||
|
|
14ce524249 | ||
|
|
4ae77dfd42 | ||
|
|
73f635a75f | ||
|
|
35bf5d08e8 | ||
|
|
5de6dd0662 | ||
|
|
709502558c | ||
|
|
46f8a982b1 | ||
|
|
bcf2333cd6 | ||
|
|
83239ff19a | ||
|
|
c277fbdf31 | ||
|
|
aca5c51487 | ||
|
|
31c29257c8 | ||
|
|
8c11001ba2 | ||
|
|
bd292be0c0 | ||
|
|
41c544f78a | ||
|
|
1be5a73571 | ||
|
|
c36ba69bda | ||
|
|
047413375c | ||
|
|
74e4bb1c5a | ||
|
|
b34474bf2c | ||
|
|
6218034dd7 | ||
|
|
77c16df31d | ||
|
|
130d6c9514 | ||
|
|
361dfdc9d8 | ||
|
|
8ebfacaa75 | ||
|
|
b89275d018 | ||
|
|
28459785ff | ||
|
|
8853a50af2 | ||
|
|
c5891b5430 | ||
|
|
707b44cc28 | ||
|
|
3a4e10c847 | ||
|
|
cbbae38f93 | ||
|
|
cdba4c74b3 | ||
|
|
a52d1396a7 | ||
|
|
1e584823f8 | ||
|
|
4c1c501a7e | ||
|
|
ae1eba6a9a | ||
|
|
e9ec2a72d8 | ||
|
|
2c9b4cf5bf | ||
|
|
9d7ae3fcdb | ||
|
|
3c2685645e | ||
|
|
773d7073ae | ||
|
|
edadca109c | ||
|
|
d86fc23bdd | ||
|
|
375e5984fe | ||
|
|
19b251fe3d | ||
|
|
15422ed3f7 | ||
|
|
8471b27df9 | ||
|
|
66652e8082 | ||
|
|
e27078ea80 | ||
|
|
d084e9fca7 | ||
|
|
3a612322eb | ||
|
|
9ea07b41da | ||
|
|
552b262936 | ||
|
|
00e6402d56 | ||
|
|
ce0946249d | ||
|
|
3f28174c6a | ||
|
|
769d0629e1 | ||
|
|
90db5b31e4 | ||
|
|
b8199f6049 | ||
|
|
7e6f123810 | ||
|
|
9312a6c03a | ||
|
|
6388b50058 | ||
|
|
048bb59728 | ||
|
|
7933638051 | ||
|
|
6b176095e3 | ||
|
|
9d0d7f48d5 | ||
|
|
50632adc58 | ||
|
|
6fa6e7ef0c | ||
|
|
90c0836902 | ||
|
|
8ef50d9a6b | ||
|
|
2a60ac91d0 | ||
|
|
9e65bb4ef4 | ||
|
|
0db574b185 | ||
|
|
2f4a71daf2 | ||
|
|
69f8a0ea37 | ||
|
|
f28125d87b | ||
|
|
46f8c6b725 | ||
|
|
af54d2e2d0 | ||
|
|
6beef12b9b | ||
|
|
ab74b2a27a | ||
|
|
2263d44b68 | ||
|
|
4f3676e726 | ||
|
|
510265472c | ||
|
|
4f02cb2eac | ||
|
|
252c011012 | ||
|
|
98f60e5acb | ||
|
|
fefce49807 | ||
|
|
a5bbbd2f24 | ||
|
|
8c8653b672 | ||
|
|
232214b2ae | ||
|
|
eb28e8068d | ||
|
|
542a4059b2 | ||
|
|
df7e12715f | ||
|
|
44c34f22d9 | ||
|
|
80221e1884 | ||
|
|
5e714f7ff4 |
@@ -1,7 +1,8 @@
|
||||
name: vllm_ci
|
||||
job_dirs:
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/image_build"
|
||||
- ".buildkite/test_areas"
|
||||
- ".buildkite/hardware_tests"
|
||||
run_all_patterns:
|
||||
- "docker/Dockerfile"
|
||||
- "CMakeLists.txt"
|
||||
|
||||
29
.buildkite/hardware_tests/amd.yaml
Normal file
29
.buildkite/hardware_tests/amd.yaml
Normal file
@@ -0,0 +1,29 @@
|
||||
group: Hardware
|
||||
steps:
|
||||
- label: "AMD: :docker: build image"
|
||||
depends_on: []
|
||||
device: amd_cpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- >
|
||||
docker build
|
||||
--build-arg max_jobs=16
|
||||
--build-arg REMOTE_VLLM=1
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
|
||||
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
||||
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
-f docker/Dockerfile.rocm
|
||||
--target test
|
||||
--no-cache
|
||||
--progress plain .
|
||||
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 1
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 1
|
||||
- exit_status: 1 # Machine occasionally fail
|
||||
limit: 1
|
||||
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
10
.buildkite/hardware_tests/ascend_npu.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
group: Hardware
|
||||
depends_on: ~
|
||||
steps:
|
||||
- label: "Ascend NPU Test"
|
||||
soft_fail: true
|
||||
timeout_in_minutes: 20
|
||||
no_plugin: true
|
||||
device: ascend_npu
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh
|
||||
100
.buildkite/hardware_tests/cpu.yaml
Normal file
100
.buildkite/hardware_tests/cpu.yaml
Normal file
@@ -0,0 +1,100 @@
|
||||
group: CPU
|
||||
depends_on: []
|
||||
steps:
|
||||
- label: CPU-Kernel Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/
|
||||
- cmake/cpu_extension.cmake
|
||||
- CMakeLists.txt
|
||||
- vllm/_custom_ops.py
|
||||
- tests/kernels/attention/test_cpu_attn.py
|
||||
- tests/kernels/moe/test_cpu_fused_moe.py
|
||||
- tests/kernels/test_onednn.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
- label: CPU-Language Generation and Pooling Model Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/
|
||||
- vllm/
|
||||
- tests/models/language/generation/
|
||||
- tests/models/language/pooling/
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model"
|
||||
|
||||
- label: CPU-Quantization Model Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/
|
||||
- vllm/model_executor/layers/quantization/cpu_wna16.py
|
||||
- vllm/model_executor/layers/quantization/gptq_marlin.py
|
||||
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
|
||||
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
|
||||
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
|
||||
- tests/quantization/test_compressed_tensors.py
|
||||
- tests/quantization/test_cpu_wna16.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
|
||||
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
|
||||
|
||||
- label: CPU-Distributed Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- csrc/cpu/shm.cpp
|
||||
- vllm/v1/worker/cpu_worker.py
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- vllm/v1/worker/cpu_model_runner.py
|
||||
- vllm/v1/worker/gpu_model_runner.py
|
||||
- vllm/platforms/cpu.py
|
||||
- vllm/distributed/parallel_state.py
|
||||
- vllm/distributed/device_communicators/cpu_communicator.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
|
||||
|
||||
- label: CPU-Multi-Modal Model Tests %N
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
# - vllm/
|
||||
- vllm/model_executor/layers/rotary_embedding
|
||||
- tests/models/multimodal/generation/
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
|
||||
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
|
||||
parallelism: 2
|
||||
|
||||
- label: "Arm CPU Test"
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: arm_cpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
|
||||
10
.buildkite/hardware_tests/gh200.yaml
Normal file
10
.buildkite/hardware_tests/gh200.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
group: Hardware
|
||||
steps:
|
||||
- label: "GH200 Test"
|
||||
soft_fail: true
|
||||
device: gh200
|
||||
no_plugin: true
|
||||
optional: true
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
|
||||
17
.buildkite/hardware_tests/intel.yaml
Normal file
17
.buildkite/hardware_tests/intel.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
group: Hardware
|
||||
depends_on: ~
|
||||
steps:
|
||||
- label: "Intel HPU Test"
|
||||
soft_fail: true
|
||||
device: intel_hpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
|
||||
|
||||
- label: "Intel GPU Test"
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_gpu
|
||||
no_plugin: true
|
||||
commands:
|
||||
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
|
||||
@@ -1,56 +1,256 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
set -euo pipefail
|
||||
|
||||
if [[ $# -lt 8 ]]; then
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
# replace invalid characters in Docker image tags and truncate to 128 chars
|
||||
clean_docker_tag() {
|
||||
local input="$1"
|
||||
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
|
||||
}
|
||||
|
||||
print_usage_and_exit() {
|
||||
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
|
||||
exit 1
|
||||
}
|
||||
|
||||
print_instance_info() {
|
||||
echo ""
|
||||
echo "=== Debug: Instance Information ==="
|
||||
# Get IMDSv2 token
|
||||
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
|
||||
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
|
||||
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
|
||||
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
|
||||
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
|
||||
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
|
||||
echo "AMI ID: ${AMI_ID}"
|
||||
echo "Instance Type: ${INSTANCE_TYPE}"
|
||||
echo "Instance ID: ${INSTANCE_ID}"
|
||||
echo "AZ: ${AZ}"
|
||||
else
|
||||
echo "Not running on EC2 or IMDS not available"
|
||||
fi
|
||||
# Check for warm cache AMI (marker file baked into custom AMI)
|
||||
if [[ -f /etc/vllm-ami-info ]]; then
|
||||
echo "Cache: warm (custom vLLM AMI)"
|
||||
cat /etc/vllm-ami-info
|
||||
else
|
||||
echo "Cache: cold (standard AMI)"
|
||||
fi
|
||||
echo "==================================="
|
||||
echo ""
|
||||
}
|
||||
|
||||
setup_buildx_builder() {
|
||||
echo "--- :buildkite: Setting up buildx builder"
|
||||
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
|
||||
# Custom AMI with standalone buildkitd - use remote driver for warm cache
|
||||
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
|
||||
echo "Using remote driver to connect to buildkitd (warm cache available)"
|
||||
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
|
||||
echo "Using existing baked-vllm-builder"
|
||||
docker buildx use baked-vllm-builder
|
||||
else
|
||||
echo "Creating baked-vllm-builder with remote driver"
|
||||
docker buildx create \
|
||||
--name baked-vllm-builder \
|
||||
--driver remote \
|
||||
--use \
|
||||
"unix://${BUILDKIT_SOCKET}"
|
||||
fi
|
||||
docker buildx inspect --bootstrap
|
||||
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
|
||||
# Existing builder available
|
||||
echo "Using existing builder: ${BUILDER_NAME}"
|
||||
docker buildx use "${BUILDER_NAME}"
|
||||
docker buildx inspect --bootstrap
|
||||
else
|
||||
# No local buildkitd, no existing builder - create new docker-container builder
|
||||
echo "No local buildkitd found, using docker-container driver"
|
||||
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
fi
|
||||
|
||||
# builder info
|
||||
echo "Active builder:"
|
||||
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
|
||||
}
|
||||
|
||||
check_and_skip_if_image_exists() {
|
||||
if [[ -n "${IMAGE_TAG:-}" ]]; then
|
||||
echo "--- :mag: Checking if image exists"
|
||||
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
|
||||
echo "Image already exists: ${IMAGE_TAG}"
|
||||
echo "Skipping build"
|
||||
exit 0
|
||||
fi
|
||||
echo "Image not found, proceeding with build"
|
||||
fi
|
||||
}
|
||||
|
||||
ecr_login() {
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
}
|
||||
|
||||
prepare_cache_tags() {
|
||||
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
|
||||
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
|
||||
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
|
||||
|
||||
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
||||
cache="${MAIN_CACHE_ECR}:latest"
|
||||
else
|
||||
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
|
||||
cache="${TEST_CACHE_ECR}:${clean_branch}"
|
||||
fi
|
||||
CACHE_TO="$cache"
|
||||
CACHE_FROM="$cache"
|
||||
CACHE_FROM_BASE_BRANCH="$cache"
|
||||
else
|
||||
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
|
||||
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
|
||||
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
|
||||
else
|
||||
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
|
||||
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
|
||||
fi
|
||||
fi
|
||||
|
||||
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
|
||||
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
|
||||
}
|
||||
|
||||
resolve_parent_commit() {
|
||||
if [[ -z "${PARENT_COMMIT:-}" ]]; then
|
||||
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
|
||||
if [[ -n "${PARENT_COMMIT}" ]]; then
|
||||
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
|
||||
export PARENT_COMMIT
|
||||
else
|
||||
echo "Could not determine parent commit (may be first commit in repo)"
|
||||
fi
|
||||
else
|
||||
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
|
||||
fi
|
||||
}
|
||||
|
||||
print_bake_config() {
|
||||
echo "--- :page_facing_up: Resolved bake configuration"
|
||||
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
|
||||
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
|
||||
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
|
||||
echo "--- :arrow_down: Uploading bake config to Buildkite"
|
||||
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
|
||||
}
|
||||
|
||||
#################################
|
||||
# Main Script #
|
||||
#################################
|
||||
print_instance_info
|
||||
|
||||
if [[ $# -lt 7 ]]; then
|
||||
print_usage_and_exit
|
||||
fi
|
||||
|
||||
# input args
|
||||
REGISTRY=$1
|
||||
REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
BRANCH=$4
|
||||
VLLM_USE_PRECOMPILED=$5
|
||||
VLLM_MERGE_BASE_COMMIT=$6
|
||||
CACHE_FROM=$7
|
||||
CACHE_TO=$8
|
||||
IMAGE_TAG=$7
|
||||
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
# build config
|
||||
TARGET="test-ci"
|
||||
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
|
||||
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
|
||||
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
|
||||
CI_HCL_PATH="/tmp/ci.hcl"
|
||||
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
|
||||
|
||||
# docker buildx
|
||||
docker buildx create --name vllm-builder --driver docker-container --use
|
||||
docker buildx inspect --bootstrap
|
||||
docker buildx ls
|
||||
prepare_cache_tags
|
||||
ecr_login
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
exit 0
|
||||
# Environment info (for docs and human readers)
|
||||
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
|
||||
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
|
||||
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
|
||||
#
|
||||
# Build configuration (exported as environment variables for bake):
|
||||
export BUILDKITE_COMMIT
|
||||
export PARENT_COMMIT
|
||||
export IMAGE_TAG
|
||||
export IMAGE_TAG_LATEST
|
||||
export CACHE_FROM
|
||||
export CACHE_FROM_BASE_BRANCH
|
||||
export CACHE_FROM_MAIN
|
||||
export CACHE_TO
|
||||
export VLLM_USE_PRECOMPILED
|
||||
export VLLM_MERGE_BASE_COMMIT
|
||||
|
||||
# print args
|
||||
echo "--- :mag: Arguments"
|
||||
echo "REGISTRY: ${REGISTRY}"
|
||||
echo "REPO: ${REPO}"
|
||||
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
|
||||
echo "BRANCH: ${BRANCH}"
|
||||
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
|
||||
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
|
||||
echo "IMAGE_TAG: ${IMAGE_TAG}"
|
||||
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
|
||||
|
||||
# print build configuration
|
||||
echo "--- :mag: Build configuration"
|
||||
echo "TARGET: ${TARGET}"
|
||||
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||
echo "BUILDER_NAME: ${BUILDER_NAME}"
|
||||
echo "CI_HCL_URL: ${CI_HCL_URL}"
|
||||
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
|
||||
|
||||
echo "--- :mag: Cache tags"
|
||||
echo "CACHE_TO: ${CACHE_TO}"
|
||||
echo "CACHE_FROM: ${CACHE_FROM}"
|
||||
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
|
||||
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
|
||||
|
||||
check_and_skip_if_image_exists
|
||||
|
||||
echo "--- :docker: Setting up Docker buildx bake"
|
||||
echo "Target: ${TARGET}"
|
||||
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||
echo "CI HCL path: ${CI_HCL_PATH}"
|
||||
|
||||
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
|
||||
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
|
||||
echo "Make sure you're running from the vLLM repository root"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
|
||||
merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
|
||||
else
|
||||
merge_base_commit_build_args=""
|
||||
echo "--- :arrow_down: Downloading ci.hcl"
|
||||
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
|
||||
echo "Downloaded to ${CI_HCL_PATH}"
|
||||
|
||||
if [[ ! -f "${CI_HCL_PATH}" ]]; then
|
||||
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# build
|
||||
docker buildx build --file docker/Dockerfile \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
|
||||
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
|
||||
--build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
|
||||
${merge_base_commit_build_args} \
|
||||
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
|
||||
--cache-to type=registry,ref=${CACHE_TO},mode=max \
|
||||
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
|
||||
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
|
||||
--push \
|
||||
--target test \
|
||||
--progress plain .
|
||||
setup_buildx_builder
|
||||
|
||||
resolve_parent_commit
|
||||
export PARENT_COMMIT
|
||||
|
||||
print_bake_config
|
||||
|
||||
echo "--- :docker: Building ${TARGET}"
|
||||
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
|
||||
|
||||
echo "--- :white_check_mark: Build complete"
|
||||
|
||||
@@ -4,7 +4,8 @@ steps:
|
||||
key: image-build
|
||||
depends_on: []
|
||||
commands:
|
||||
- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
|
||||
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
|
||||
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
|
||||
@@ -14,7 +14,7 @@ BUILDKITE_COMMIT=$3
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
|
||||
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
|
||||
echo "Image not found, proceeding with build..."
|
||||
else
|
||||
echo "Image found"
|
||||
@@ -24,10 +24,10 @@ fi
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
|
||||
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
# push
|
||||
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
|
||||
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.695
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.447
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
max_model_len: 262144
|
||||
enforce_eager: false
|
||||
apply_chat_template: true
|
||||
fewshot_as_multiturn: true
|
||||
trust_remote_code: true
|
||||
@@ -0,0 +1,19 @@
|
||||
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.7142
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.4579
|
||||
env_vars:
|
||||
VLLM_USE_FLASHINFER_MOE_FP8: "1"
|
||||
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
max_model_len: 262144
|
||||
kv_cache_dtype: fp8
|
||||
enforce_eager: false
|
||||
apply_chat_template: true
|
||||
fewshot_as_multiturn: true
|
||||
trust_remote_code: true
|
||||
@@ -1 +1,2 @@
|
||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
||||
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
|
||||
|
||||
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
|
||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||
Qwen2-57B-A14-Instruct.yaml
|
||||
DeepSeek-V2-Lite-Chat.yaml
|
||||
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
|
||||
|
||||
5
.buildkite/lm-eval-harness/configs/models-small-rocm.txt
Normal file
5
.buildkite/lm-eval-harness/configs/models-small-rocm.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
Qwen2.5-1.5B-Instruct.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
||||
@@ -393,7 +393,7 @@ if __name__ == "__main__":
|
||||
with open(results_folder / md_file, "w") as f:
|
||||
results = read_markdown(
|
||||
"../.buildkite/performance-benchmarks/"
|
||||
+ "performance-benchmarks-descriptions.md"
|
||||
"performance-benchmarks-descriptions.md"
|
||||
)
|
||||
results = results.format(
|
||||
latency_tests_markdown_table=latency_md_table,
|
||||
|
||||
@@ -25,9 +25,9 @@ check_gpus() {
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
declare -g arch_suffix=''
|
||||
|
||||
|
||||
if command -v nvidia-smi; then
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
elif command -v amd-smi; then
|
||||
@@ -181,19 +181,20 @@ upload_to_buildkite() {
|
||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
run_latency_tests() {
|
||||
# run latency tests using `vllm bench latency` command
|
||||
# $1: a json file specifying latency test cases
|
||||
run_benchmark_tests() {
|
||||
# run benchmark tests using `vllm bench <test_type>` command
|
||||
# $1: test type (latency or throughput)
|
||||
# $2: a json file specifying test cases
|
||||
|
||||
local latency_test_file
|
||||
latency_test_file=$1
|
||||
local test_type=$1
|
||||
local test_file=$2
|
||||
|
||||
# Iterate over latency tests
|
||||
jq -c '.[]' "$latency_test_file" | while read -r params; do
|
||||
# Iterate over tests
|
||||
jq -c '.[]' "$test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^latency_ ]]; then
|
||||
echo "In latency-test.json, test_name must start with \"latency_\"."
|
||||
if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
|
||||
echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -204,15 +205,15 @@ run_latency_tests() {
|
||||
fi
|
||||
|
||||
# get arguments
|
||||
latency_params=$(echo "$params" | jq -r '.parameters')
|
||||
latency_args=$(json2args "$latency_params")
|
||||
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
latency_envs=$(json2envs "$latency_environment_variables")
|
||||
bench_params=$(echo "$params" | jq -r '.parameters')
|
||||
bench_args=$(json2args "$bench_params")
|
||||
bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
bench_envs=$(json2envs "$bench_environment_variables")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||
tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ "$ON_CPU" == "1" ]]; then
|
||||
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
|
||||
pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
@@ -225,97 +226,42 @@ run_latency_tests() {
|
||||
fi
|
||||
fi
|
||||
|
||||
latency_command=" $latency_envs vllm bench latency \
|
||||
bench_command=" $bench_envs vllm bench $test_type \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$latency_args"
|
||||
$bench_args"
|
||||
|
||||
echo "Running test case $test_name"
|
||||
echo "Latency command: $latency_command"
|
||||
echo "${test_type^} command: $bench_command"
|
||||
|
||||
# recoding benchmarking command ang GPU command
|
||||
# recording benchmarking command and GPU command
|
||||
jq_output=$(jq -n \
|
||||
--arg latency "$latency_command" \
|
||||
--arg command "$bench_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg test_type "$test_type" \
|
||||
'{
|
||||
latency_command: $latency,
|
||||
($test_type + "_command"): $command,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||
|
||||
# run the benchmark
|
||||
eval "$latency_command"
|
||||
eval "$bench_command"
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
done
|
||||
}
|
||||
|
||||
run_latency_tests() {
|
||||
run_benchmark_tests "latency" "$1"
|
||||
}
|
||||
|
||||
run_startup_tests() {
|
||||
run_benchmark_tests "startup" "$1"
|
||||
}
|
||||
|
||||
run_throughput_tests() {
|
||||
# run throughput tests using `vllm bench throughput`
|
||||
# $1: a json file specifying throughput test cases
|
||||
|
||||
local throughput_test_file
|
||||
throughput_test_file=$1
|
||||
|
||||
# Iterate over throughput tests
|
||||
jq -c '.[]' "$throughput_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
if [[ ! "$test_name" =~ ^throughput_ ]]; then
|
||||
echo "In throughput-test.json, test_name must start with \"throughput_\"."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# get arguments
|
||||
throughput_params=$(echo "$params" | jq -r '.parameters')
|
||||
throughput_args=$(json2args "$throughput_params")
|
||||
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
|
||||
throughput_envs=$(json2envs "$throughput_environment_variables")
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||
if [[ "$ON_CPU" == "1" ]]; then
|
||||
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
|
||||
world_size=$(($tp*$pp))
|
||||
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
|
||||
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
else
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
throughput_command=" $throughput_envs vllm bench throughput \
|
||||
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||
$throughput_args"
|
||||
|
||||
echo "Running test case $test_name"
|
||||
echo "Throughput command: $throughput_command"
|
||||
# recoding benchmarking command ang GPU command
|
||||
jq_output=$(jq -n \
|
||||
--arg command "$throughput_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
throughput_command: $command,
|
||||
gpu_type: $gpu
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
|
||||
|
||||
# run the benchmark
|
||||
eval "$throughput_command"
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
done
|
||||
run_benchmark_tests "throughput" "$1"
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
@@ -447,6 +393,11 @@ run_serving_tests() {
|
||||
fi
|
||||
fi
|
||||
|
||||
# save the compilation mode and optimization level on the serving results
|
||||
# whenever they are set
|
||||
compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
|
||||
optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
@@ -460,15 +411,15 @@ run_serving_tests() {
|
||||
for max_concurrency in $max_concurrency_list; do
|
||||
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
|
||||
echo " new test name $new_test_name"
|
||||
# pass the tensor parallel size to the client so that it can be displayed
|
||||
# on the benchmark dashboard
|
||||
# pass the tensor parallel size, the compilation mode, and the optimization
|
||||
# level to the client so that they can be used on the benchmark dashboard
|
||||
client_command="vllm bench serve \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--max-concurrency $max_concurrency \
|
||||
--metadata "tensor_parallel_size=$tp" \
|
||||
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
|
||||
$client_args $client_remote_args "
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
@@ -534,6 +485,7 @@ main() {
|
||||
# benchmarking
|
||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
||||
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
|
||||
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
|
||||
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
|
||||
|
||||
# postprocess benchmarking results
|
||||
|
||||
@@ -1,219 +1,291 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build arm64 wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# aarch64 build
|
||||
- label: "Build arm64 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 CPU wheel build
|
||||
- label: "Build x86 CPU wheel"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# Build release images (12.9)
|
||||
- label: "Build release image (x86)"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image (arm64)"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
- label: "Create multi-arch manifest"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- input: "Provide Release version here"
|
||||
id: input-release-version
|
||||
fields:
|
||||
- text: "What is the release version?"
|
||||
key: release-version
|
||||
|
||||
- block: "Build CPU release image"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
- group: "Build Python wheels"
|
||||
key: "build-wheels"
|
||||
steps:
|
||||
- label: "Build wheel - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build and publish CPU release image"
|
||||
depends_on: block-cpu-release-image-build
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
- label: "Build wheel - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build arm64 CPU release image"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
- label: "Build wheel - aarch64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cpu
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build and publish arm64 CPU release image"
|
||||
depends_on: block-arm64-cpu-release-image-build
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
- label: "Build wheel - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-12-9
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build ROCm release image"
|
||||
key: block-rocm-release-image-build
|
||||
depends_on: ~
|
||||
- label: "Build wheel - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build release image (ROCm)"
|
||||
depends_on: block-rocm-release-image-build
|
||||
id: build-release-image-rocm
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# Build base image first
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
||||
# Build vLLM ROCm image using the base
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
||||
- label: "Build wheel - x86_64 - CPU"
|
||||
depends_on: ~
|
||||
id: build-wheel-x86-cpu
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
|
||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest push vllm/vllm-openai:nightly"
|
||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
- group: "Build release Docker images"
|
||||
key: "build-release-images"
|
||||
steps:
|
||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
|
||||
- block: "Build release image for x86_64 CPU"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - CPU"
|
||||
depends_on:
|
||||
- block-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image for arm64 CPU"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - arm64 - CPU"
|
||||
depends_on:
|
||||
- block-arm64-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- group: "Publish release images"
|
||||
key: "publish-release-images"
|
||||
steps:
|
||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow - CUDA 12.9"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
||||
depends_on:
|
||||
- build-release-image-x86-cuda-13-0
|
||||
- build-release-image-arm64-cuda-13-0
|
||||
id: create-multi-arch-manifest-cuda-13-0
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest-cuda-13-0
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- group: "Publish release artifacts"
|
||||
key: "publish-release-artifacts"
|
||||
steps:
|
||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||
key: block-upload-release-wheels
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- build-wheels
|
||||
|
||||
- label: "Upload release wheels to PyPI"
|
||||
depends_on:
|
||||
- block-upload-release-wheels
|
||||
id: upload-release-wheels
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||
|
||||
- block: "Confirm update release images to DockerHub"
|
||||
key: block-update-release-images-dockerhub
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- annotate-release-workflow
|
||||
|
||||
- label: "Publish release images to DockerHub"
|
||||
depends_on:
|
||||
- block-update-release-images-dockerhub
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-release-images-dockerhub.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
# =============================================================================
|
||||
# ROCm Release Pipeline (x86_64 only)
|
||||
@@ -408,7 +480,7 @@ steps:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||
- label: ":python: Build vLLM ROCm Wheel"
|
||||
- label: ":python: Build vLLM ROCm Wheel - x86_64"
|
||||
id: build-rocm-vllm-wheel
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
@@ -570,9 +642,93 @@ steps:
|
||||
depends_on:
|
||||
- step: upload-rocm-wheels
|
||||
allow_failure: true
|
||||
- step: input-release-version
|
||||
allow_failure: true
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
|
||||
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
|
||||
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
|
||||
- block: "Generate Root Index for ROCm Wheels for Release"
|
||||
key: block-generate-root-index-rocm-wheels
|
||||
depends_on: upload-rocm-wheels
|
||||
|
||||
- label: ":package: Generate Root Index for ROCm Wheels for Release"
|
||||
depends_on: block-generate-root-index-rocm-wheels
|
||||
id: generate-root-index-rocm-wheels
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
VARIANT: "rocm700"
|
||||
|
||||
# ROCm Job 5: Build ROCm Release Docker Image
|
||||
- label: ":docker: Build release image - x86_64 - ROCm"
|
||||
id: build-rocm-release-image
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Download Docker image from S3 (set by build-rocm-base-wheels)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
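# Illustrative: `docker load` prints a line such as "Loaded image: rocm/vllm-dev:base-<commit>",
# so BASE_IMAGE_TAG ends up holding e.g. "rocm/vllm-dev:base-abc123" (hypothetical tag value).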
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Tag and push the base image to ECR
|
||||
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Build vLLM ROCm release image using cached base
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
|
||||
--target vllm-openai \
|
||||
--progress plain \
|
||||
-f docker/Dockerfile.rocm .
|
||||
|
||||
# Push to ECR
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
||||
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
@@ -11,28 +11,36 @@ fi
|
||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel (by commit):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
|
||||
(Optional) For CUDA 13.0:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
|
||||
(Optional) For CPU:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download the wheel (by version):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
# Download images:
|
||||
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
|
||||
|
||||
# Tag and push images:
|
||||
|
||||
## CUDA
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
@@ -40,22 +48,70 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai:latest-x86_64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
docker push vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
|
||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
|
||||
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
||||
docker push vllm/vllm-openai:latest-rocm
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker push vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
|
||||
## ROCm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
## CPU
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker push vllm/vllm-openai-cpu:latest-arm64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
|
||||
# Create multi-arch manifest:
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest-cu130
|
||||
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker manifest push vllm/vllm-openai:latest-cu130
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
||||
|
||||
docker manifest rm vllm/vllm-openai-cpu:latest || true
|
||||
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker manifest push vllm/vllm-openai-cpu:latest
|
||||
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
||||
EOF
|
||||
|
||||
@@ -3,25 +3,32 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Generate Buildkite annotation for ROCm wheel release
|
||||
|
||||
set -ex
|
||||
|
||||
# Get build configuration from meta-data
|
||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
|
||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
|
||||
# TODO: Enable the nightly build for ROCm
|
||||
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
|
||||
if [ -z "${RELEASE_VERSION}" ]; then
|
||||
RELEASE_VERSION="1.0.0.dev"
|
||||
fi
|
||||
|
||||
# S3 URLs
|
||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
|
||||
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
|
||||
|
||||
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
|
||||
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
|
||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||
## :rocm: ROCm Wheel Release
|
||||
|
||||
## ROCm Wheel and Docker Image Releases
|
||||
### Build Configuration
|
||||
| Setting | Value |
|
||||
|---------|-------|
|
||||
@@ -34,41 +41,72 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
|
||||
### :package: Installation
|
||||
|
||||
**Install from this build (by commit):**
|
||||
\`\`\`bash
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
|
||||
|
||||
# Example:
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
|
||||
\`\`\`bash
|
||||
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
|
||||
# Example for ROCm ${ROCM_VERSION}:
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
**Install from nightly (if published):**
|
||||
|
||||
\`\`\`bash
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
### :floppy_disk: Download Wheels Directly
|
||||
|
||||
\`\`\`bash
|
||||
# List all ROCm wheels
|
||||
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
|
||||
|
||||
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
|
||||
# Download specific wheels
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
|
||||
\`\`\`
|
||||
|
||||
### :gear: Included Packages
|
||||
- **vllm**: vLLM with ROCm support
|
||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||
- **triton_rocm**: Triton built for ROCm
|
||||
- **triton**: Triton
|
||||
- **triton-kernels**: Triton kernels
|
||||
- **torchvision**: TorchVision for ROCm PyTorch
|
||||
- **torchaudio**: Torchaudio for ROCm PyTorch
|
||||
- **amdsmi**: AMD SMI Python bindings
|
||||
- **aiter**: Aiter for ROCm
|
||||
- **flash-attn**: Flash Attention for ROCm
|
||||
|
||||
### :warning: Notes
|
||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||
- Platform: Linux x86_64 only
|
||||
|
||||
### :package: Docker Image Release
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
|
||||
EOF
|
||||
|
||||
242
.buildkite/scripts/cherry-pick-from-milestone.sh
Executable file
@@ -0,0 +1,242 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# cherry-pick-from-milestone.sh
|
||||
# Find commits from a GitHub milestone that are missing from the current branch
|
||||
# and output them in chronological order for cherry-picking.
|
||||
#
|
||||
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") <milestone> [options]
|
||||
|
||||
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
|
||||
|
||||
Arguments:
|
||||
milestone The GitHub milestone name (e.g., v0.14.0)
|
||||
|
||||
Options:
|
||||
--dry-run Show the cherry-pick commands without executing (default)
|
||||
--execute Actually execute the cherry-picks
|
||||
--main-branch Specify the main branch name (default: main)
|
||||
--help Show this help message
|
||||
|
||||
Examples:
|
||||
$(basename "$0") v0.14.0
|
||||
$(basename "$0") v0.14.0 --dry-run
|
||||
$(basename "$0") v0.14.0 --execute
|
||||
$(basename "$0") v0.14.0 --main-branch master
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Default values
|
||||
MILESTONE=""
|
||||
DRY_RUN=true
|
||||
MAIN_BRANCH="main"
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--main-branch)
|
||||
MAIN_BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
MILESTONE="$1"
|
||||
else
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate milestone argument
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
log_error "Milestone is required"
|
||||
usage
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if gh CLI is available
|
||||
if ! command -v gh &>/dev/null; then
|
||||
log_error "GitHub CLI (gh) is not installed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if authenticated with gh
|
||||
if ! gh auth status &>/dev/null; then
|
||||
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CURRENT_BRANCH=$(git branch --show-current)
|
||||
log_info "Current branch: ${CURRENT_BRANCH}"
|
||||
log_info "Main branch: ${MAIN_BRANCH}"
|
||||
log_info "Milestone: ${MILESTONE}"
|
||||
echo ""
|
||||
|
||||
# Fetch latest from remote
|
||||
log_info "Fetching latest from remote..."
|
||||
git fetch origin "$MAIN_BRANCH" --quiet
|
||||
|
||||
# Get merged PRs from the milestone, sorted by merge date
|
||||
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
||||
|
||||
# Store PR data in a temp file
|
||||
PR_DATA=$(mktemp)
|
||||
trap "rm -f $PR_DATA" EXIT
|
||||
|
||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||
--limit 1000 \
|
||||
--json number,title,mergeCommit,mergedAt \
|
||||
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
|
||||
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
|
||||
log_error "This could be due to:"
|
||||
log_error " - Milestone does not exist"
|
||||
log_error " - Network/authentication issues"
|
||||
log_error " - Invalid milestone name format"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -s "$PR_DATA" ]]; then
|
||||
log_warn "No merged PRs found for milestone '${MILESTONE}'"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
TOTAL_PRS=$(wc -l < "$PR_DATA")
|
||||
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
|
||||
echo ""
|
||||
|
||||
# Find commits that are missing from current branch
|
||||
MISSING_COMMITS=()
|
||||
MISSING_INFO=()
|
||||
|
||||
while IFS=$'\t' read -r sha pr_number title; do
|
||||
# Skip if SHA is empty or null
|
||||
if [[ -z "$sha" || "$sha" == "null" ]]; then
|
||||
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Check if this commit is already in the current branch
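# Note: `git merge-base --is-ancestor A B` exits 0 when A is an ancestor of B,
# so a zero exit here means the PR's merge commit is already reachable from HEAD.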
|
||||
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
|
||||
log_success "PR #${pr_number} already in branch: ${title:0:60}"
|
||||
else
|
||||
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
|
||||
MISSING_COMMITS+=("$sha")
|
||||
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
|
||||
fi
|
||||
done < "$PR_DATA"
|
||||
|
||||
echo ""
|
||||
|
||||
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
|
||||
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
|
||||
echo ""
|
||||
|
||||
# Output the cherry-pick commands
|
||||
echo "=========================================="
|
||||
echo "Cherry-pick commands (in chronological order):"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
for info in "${MISSING_INFO[@]}"; do
|
||||
echo "# $info"
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "# Run these commands to cherry-pick all missing commits:"
|
||||
echo "git cherry-pick ${MISSING_COMMITS[*]}"
|
||||
echo ""
|
||||
|
||||
# Or one by one
|
||||
echo "# Or cherry-pick one at a time:"
|
||||
for sha in "${MISSING_COMMITS[@]}"; do
|
||||
echo "git cherry-pick $sha"
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Execute if requested
|
||||
if [[ "$DRY_RUN" == false ]]; then
|
||||
echo "=========================================="
|
||||
log_info "Executing cherry-picks..."
|
||||
echo "=========================================="
|
||||
|
||||
for i in "${!MISSING_COMMITS[@]}"; do
|
||||
sha="${MISSING_COMMITS[$i]}"
|
||||
info="${MISSING_INFO[$i]}"
|
||||
|
||||
echo ""
|
||||
log_info "Cherry-picking: $info"
|
||||
|
||||
if git cherry-pick "$sha"; then
|
||||
log_success "Successfully cherry-picked $sha"
|
||||
else
|
||||
log_error "Failed to cherry-pick $sha"
|
||||
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
log_success "All cherry-picks completed successfully!"
|
||||
else
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
|
||||
echo "Run with --execute to perform the cherry-picks"
|
||||
echo "=========================================="
|
||||
fi
|
||||
@@ -3,7 +3,14 @@
|
||||
set -ex
|
||||
|
||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
||||
# This script uses DockerHub API to list and delete old tags with specified prefix
|
||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
|
||||
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
|
||||
|
||||
# Get tag prefix from argument, default to "nightly-" if not provided
|
||||
TAG_PREFIX="${1:-nightly-}"
|
||||
|
||||
echo "Cleaning up tags with prefix: $TAG_PREFIX"
|
||||
|
||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||
@@ -45,7 +52,7 @@ get_all_tags() {
|
||||
set -x
|
||||
|
||||
# Get both last_updated timestamp and tag name, separated by |
|
||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
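# Illustrative: a result entry {"name": "nightly-abc123", "last_updated": "2025-01-02T03:04:05Z"}
# becomes the line "2025-01-02T03:04:05Z|nightly-abc123".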
|
||||
|
||||
if [ -z "$tags" ]; then
|
||||
break
|
||||
|
||||
@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
|
||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||
"""
|
||||
Generate project list HTML content linking to each project & variant sub-directory.
|
||||
Generate project list HTML content linking to each project & variant subdirectory.
|
||||
"""
|
||||
href_tags = []
|
||||
for name in sorted(subdir_names):
|
||||
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
|
||||
comment (str | None): Optional comment to include in the generated HTML files.
|
||||
|
||||
First, parse all wheel files to extract metadata.
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
|
||||
The index for the default variant (if any) is generated in the root index directory.
|
||||
|
||||
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
|
||||
is purely a copy of the corresponding variant index, with only the links adjusted.
|
||||
Otherwise, all wheels without variant suffixes are treated as the default variant.
|
||||
|
||||
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
|
||||
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
|
||||
as the default variant index, but the links are adjusted accordingly.
|
||||
|
||||
Index directory structure:
|
||||
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
|
||||
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
|
||||
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
|
||||
vllm/
|
||||
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
|
||||
metadata.json # machine-readable metadata for all wheels in this package
|
||||
cpu/ # cpu variant sub-directory
|
||||
cpu/ # cpu variant subdirectory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
|
||||
vllm/
|
||||
index.html
|
||||
metadata.json
|
||||
cu130/ # cu130 variant sub-directory
|
||||
cu130/ # cu130 variant subdirectory
|
||||
index.html
|
||||
vllm/
|
||||
index.html
|
||||
|
||||
@@ -44,6 +44,17 @@ cleanup_docker() {
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup_network() {
|
||||
for node in $(seq 0 $((NUM_NODES-1))); do
|
||||
if docker ps -a -q -f name="node${node}" | grep -q .; then
|
||||
docker stop "node${node}"
|
||||
fi
|
||||
done
|
||||
if docker network ls | grep docker-net; then
|
||||
docker network rm docker-net
|
||||
fi
|
||||
}
|
||||
|
||||
# Call the cleanup docker function
|
||||
cleanup_docker
|
||||
|
||||
@@ -76,7 +87,7 @@ mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
echo "Commands:$commands"
|
||||
echo "Raw commands: $commands"
|
||||
|
||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||
|
||||
@@ -158,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
commands=$(echo "$commands" | sed 's/ \\ / /g')
|
||||
echo "Final commands: $commands"
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
@@ -165,7 +179,6 @@ fi
|
||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
||||
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# Test that we're launching on the machine that has
|
||||
@@ -176,53 +189,33 @@ if [[ -z "$render_gid" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
|
||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||
# assign shard-id for each shard
|
||||
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
|
||||
echo "Shard ${GPU} commands:$commands_gpu"
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||
PIDS+=($!)
|
||||
done
|
||||
# Wait for all processes to finish and collect exit codes
|
||||
for pid in "${PIDS[@]}"; do
|
||||
wait "${pid}"
|
||||
STATUS+=($?)
|
||||
done
|
||||
at_least_one_shard_with_tests=0
|
||||
for st in "${STATUS[@]}"; do
|
||||
if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
|
||||
echo "One of the processes failed with $st"
|
||||
exit "${st}"
|
||||
elif [[ ${st} -eq 5 ]]; then
|
||||
echo "Shard exited with status 5 (no tests collected) - treating as success"
|
||||
else # This means st is 0
|
||||
at_least_one_shard_with_tests=1
|
||||
fi
|
||||
done
|
||||
if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
|
||||
echo "All shards reported no tests collected. Failing the build."
|
||||
exit 1
|
||||
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
|
||||
|
||||
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
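# Illustrative: "Docker version 24.0.7, build afdd53b" -> DCKR_VER="24.0.7" (example values)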
|
||||
|
||||
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
|
||||
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
|
||||
echo "PREFIX: ${prefix}"
|
||||
export composite_command="(command rocm-smi || true)"
|
||||
myIFS=$IFS
|
||||
IFS=','
|
||||
read -ra node0 <<< ${BASH_REMATCH[2]}
|
||||
read -ra node1 <<< ${BASH_REMATCH[3]}
|
||||
IFS=$myIFS
|
||||
for i in "${!node0[@]}";do
|
||||
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
|
||||
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
|
||||
|
||||
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
|
||||
echo "COMMANDS: ${commands}"
|
||||
composite_command=$(echo "${composite_command} && ${commands}")
|
||||
done
|
||||
/bin/bash -c "${composite_command}"
|
||||
cleanup_network
|
||||
else
|
||||
echo "Failed to parse node commands! Exiting."
|
||||
cleanup_network
|
||||
exit 111
|
||||
fi
|
||||
else
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
set -euox pipefail
|
||||
|
||||
echo "--- PP+TP"
|
||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &
|
||||
|
||||
echo "--- DP+TP"
|
||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &
|
||||
@@ -2,119 +2,19 @@
|
||||
|
||||
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
set -euox pipefail
|
||||
|
||||
# Allow binding to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
# used for TP/PP E2E test
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
IMAGE_NAME="cpu-test-$NUMA_NODE"
|
||||
TIMEOUT_VAL=$1
|
||||
TEST_COMMAND=$2
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
# building the docker image
|
||||
echo "--- :docker: Building Docker image"
|
||||
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# list packages
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
|
||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
# Note: disable until supports V1
|
||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -x -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
||||
|
||||
# Run AWQ/GPTQ test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_cpu_wna16.py"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -x -s -v \
|
||||
tests/lora/test_qwenvl.py"
|
||||
|
||||
# online serving: tp+pp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
|
||||
# online serving: tp+dp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
}
|
||||
|
||||
# All CPU tests are expected to finish in less than 40 minutes.
|
||||
export -f cpu_tests
|
||||
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||
docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
|
||||
timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
|
||||
|
||||
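The rewritten script takes a timeout and a test command as positional arguments ($1 and $2) and forwards both into the container. A hedged example invocation follows; the script path, timeout value, and pytest target are illustrative assumptions, not taken from the pipeline config.

# Assumed invocation shape: $1 is passed to `timeout`, $2 is run inside the container.
NUMA_NODE=1 CORE_RANGE=48-95 OMP_CORE_RANGE=48-95 \
  bash .buildkite/scripts/hardware_ci/run-cpu-test.sh \
  "40m" \
  "pytest -x -v -s tests/models/language/generation -m cpu_model"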
@@ -5,7 +5,9 @@
|
||||
set -exuo pipefail
|
||||
|
||||
# Try building the docker image
|
||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
||||
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
|
||||
cat <<EOF | docker build -t ${image_name} -f - .
|
||||
FROM gaudi-base-image:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
|
||||
ENV no_proxy=localhost,127.0.0.1
|
||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=empty pip install .
|
||||
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
|
||||
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
|
||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
||||
|
||||
# install development dependencies (for testing)
|
||||
@@ -36,15 +39,20 @@ EOF
|
||||
# functions, while other platforms only need one remove_docker_container
|
||||
# function.
|
||||
EXITCODE=1
|
||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
||||
remove_docker_containers() { docker rm -f ${container_name} || true; }
|
||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||
remove_docker_containers
|
||||
|
||||
echo "Running HPU plugin v1 test"
|
||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
||||
docker run --rm --runtime=habana --name=${container_name} --network=host \
|
||||
-e HABANA_VISIBLE_DEVICES=all \
|
||||
hpu-plugin-v1-test-env \
|
||||
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
||||
-e VLLM_SKIP_WARMUP=true \
|
||||
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
|
||||
-e PT_HPU_LAZY_MODE=1 \
|
||||
"${image_name}" \
|
||||
/bin/bash -c '
|
||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
'
|
||||
|
||||
EXITCODE=$?
|
||||
if [ $EXITCODE -eq 0 ]; then
|
||||
|
||||
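The Dockerfile change installs the build requirements with the torch pin filtered out via a sed process substitution; a tiny hedged illustration of that pattern, using an invented requirements file, is below.

# Invented file, only to demonstrate the filter; the real input is requirements/build.txt.
printf 'torch==2.6.0\ncmake>=3.26\nninja\n' > /tmp/build-reqs.txt
pip install -r <(sed "/^torch/d" /tmp/build-reqs.txt)   # installs cmake and ninja, skips the torch line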
@@ -38,15 +38,18 @@ docker run \
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
cd tests
|
||||
pytest -v -s v1/core
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
||||
pytest -v -s v1/engine
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
36
.buildkite/scripts/push-nightly-builds.sh
Executable file
@@ -0,0 +1,36 @@
#!/bin/bash

set -ex

# Get the tag variant from the first argument; default to empty if not provided. It should be something like "cu130".
# Due to limits in the cleanup script, variants must use separate tags like "cu130-nightly";
# otherwise they would be cleaned up together with the main "nightly" tags.

TAG_VARIANT="$1"
if [ -n "$TAG_VARIANT" ]; then
  ORIG_TAG_SUFFIX="-$TAG_VARIANT"
  TAG_NAME="$TAG_VARIANT-nightly"
else
  ORIG_TAG_SUFFIX=""
  TAG_NAME="nightly"
fi

ORIG_TAG_NAME="$BUILDKITE_COMMIT"

echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"

# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
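Two hedged example invocations of the script above follow; the resulting tag names are implied by the logic shown, but the call sites themselves are assumptions.

# Without a variant: the commit-tagged images are republished as
# vllm/vllm-openai:nightly-x86_64 / :nightly-aarch64 plus a "nightly" manifest.
bash .buildkite/scripts/push-nightly-builds.sh

# With the "cu130" variant: pulls the *-cu130 commit tags and republishes them
# under the separate cu130-nightly tag family.
bash .buildkite/scripts/push-nightly-builds.sh cu130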
98
.buildkite/scripts/push-release-images-dockerhub.sh
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
|
||||
if [ -z "${RELEASE_VERSION}" ]; then
|
||||
echo "RELEASE_VERSION is not set"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Download images:
|
||||
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
|
||||
|
||||
# Tag and push images:
|
||||
|
||||
## CUDA
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai:latest-x86_64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
docker push vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker push vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
|
||||
## ROCm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
## CPU
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai-cpu:latest-x86_64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker push vllm/vllm-openai-cpu:latest-arm64
|
||||
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
|
||||
# Create multi-arch manifest:
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest-cu130
|
||||
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker manifest push vllm/vllm-openai:latest-cu130
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
||||
|
||||
docker manifest rm vllm/vllm-openai-cpu:latest || true
|
||||
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
|
||||
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
|
||||
docker manifest push vllm/vllm-openai-cpu:latest
|
||||
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
|
||||
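As a hedged follow-up check (not part of the release script above), the pushed multi-arch manifests can be inspected to confirm both architectures landed.

# Assumed verification step; `docker manifest inspect` prints the manifest list as JSON.
docker manifest inspect vllm/vllm-openai:v${RELEASE_VERSION} \
  | jq -r '.manifests[].platform.architecture'
# Expected to list amd64 and arm64 (order may vary).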
@@ -43,7 +43,6 @@ trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--enable-eplb \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--all2all-backend $BACK \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
@@ -18,15 +18,18 @@ wait_for_server() {
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
# Set BACKENDS and platform-specific args based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
PLATFORM_ARGS=("--no-async-scheduling")
|
||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
PLATFORM_ARGS=()
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
@@ -54,6 +57,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
"${PLATFORM_ARGS[@]}" \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
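The PLATFORM_ARGS array is what lets the ROCm branch add --no-async-scheduling without duplicating the whole serve command; a minimal sketch of the same pattern with a placeholder command follows.

# "my_server" is a placeholder; the point is the empty-array expansion.
PLATFORM_ARGS=()
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]]; then
  PLATFORM_ARGS=(--no-async-scheduling)
fi
# "${PLATFORM_ARGS[@]}" expands to zero words when the array is empty,
# so the same command line works on both platforms.
my_server --port 8000 "${PLATFORM_ARGS[@]}" &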
227
.buildkite/scripts/trigger-ci-build.sh
Executable file
@@ -0,0 +1,227 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# trigger-ci-build.sh
|
||||
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
|
||||
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
#
|
||||
# Usage: ./trigger-ci-build.sh [options]
|
||||
#
|
||||
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
|
||||
#
|
||||
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default configuration
|
||||
PIPELINE="ci"
|
||||
DRY_RUN=true
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") [options]
|
||||
|
||||
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
|
||||
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
|
||||
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
|
||||
Options:
|
||||
--execute Actually trigger the build (default: dry-run)
|
||||
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
|
||||
--commit Override commit SHA (default: current HEAD)
|
||||
--branch Override branch name (default: current branch)
|
||||
--message Custom build message (default: auto-generated)
|
||||
--help Show this help message
|
||||
|
||||
Prerequisites:
|
||||
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
|
||||
- bk configured: bk configure
|
||||
|
||||
Examples:
|
||||
$(basename "$0") # Dry-run, show what would happen
|
||||
$(basename "$0") --execute # Actually trigger the build
|
||||
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Parse arguments
|
||||
COMMIT=""
|
||||
BRANCH=""
|
||||
MESSAGE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--pipeline)
|
||||
PIPELINE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--commit)
|
||||
COMMIT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--branch)
|
||||
BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--message)
|
||||
MESSAGE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Check if bk CLI is installed
|
||||
if ! command -v bk &>/dev/null; then
|
||||
log_error "Buildkite CLI (bk) is not installed"
|
||||
echo ""
|
||||
echo "Install with:"
|
||||
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
|
||||
echo ""
|
||||
echo "Then configure:"
|
||||
echo " bk configure"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get current commit and branch if not overridden
|
||||
if [[ -z "$COMMIT" ]]; then
|
||||
COMMIT=$(git rev-parse HEAD)
|
||||
fi
|
||||
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
BRANCH=$(git branch --show-current)
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
# Detached HEAD state - try to get branch from ref
|
||||
BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
fi
|
||||
fi
|
||||
|
||||
# Generate default message if not provided
|
||||
if [[ -z "$MESSAGE" ]]; then
|
||||
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
|
||||
MESSAGE="[Manual] ${COMMIT_MSG}"
|
||||
fi
|
||||
|
||||
# Safety check: Verify the commit exists on the remote
|
||||
log_info "Verifying commit exists on remote..."
|
||||
git fetch origin --quiet 2>/dev/null || true
|
||||
|
||||
# Check if commit is reachable from any remote branch
|
||||
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
|
||||
if [[ -z "$REMOTE_BRANCHES" ]]; then
|
||||
log_error "Commit ${COMMIT} does not exist on any remote branch!"
|
||||
echo ""
|
||||
echo "The CI system will fail to checkout this commit."
|
||||
echo "Please push your changes first:"
|
||||
echo ""
|
||||
echo " git push origin ${BRANCH}"
|
||||
echo ""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "Commit found on remote branches:"
|
||||
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
|
||||
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
|
||||
echo " ... and more"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
log_info "Pipeline: ${PIPELINE}"
|
||||
log_info "Branch: ${BRANCH}"
|
||||
log_info "Commit: ${COMMIT}"
|
||||
log_info "Message: ${MESSAGE}"
|
||||
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
|
||||
echo ""
|
||||
|
||||
# Build the command
|
||||
CMD=(bk build create
|
||||
-y
|
||||
-w
|
||||
-i
|
||||
--pipeline "${PIPELINE}"
|
||||
--commit "${COMMIT}"
|
||||
--branch "${BRANCH}"
|
||||
--message "${MESSAGE}"
|
||||
--env "RUN_ALL=1"
|
||||
--env "NIGHTLY=1"
|
||||
)
|
||||
|
||||
if [[ "$DRY_RUN" == true ]]; then
|
||||
echo "=========================================="
|
||||
log_warn "DRY-RUN MODE - No build will be triggered"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "Command that would be executed:"
|
||||
echo ""
|
||||
# Escape single quotes in values for safe shell display
|
||||
escape_for_shell() {
|
||||
printf '%s' "$1" | sed "s/'/'\\\\''/g"
|
||||
}
|
||||
echo " bk build create \\"
|
||||
echo " -y \\"
|
||||
echo " -w \\"
|
||||
echo " -i \\"
|
||||
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
|
||||
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
|
||||
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
|
||||
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
|
||||
echo " --env 'RUN_ALL=1' \\"
|
||||
echo " --env 'NIGHTLY=1'"
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
|
||||
echo ""
|
||||
echo " $0 --execute"
|
||||
echo "=========================================="
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Triggering build..."
|
||||
|
||||
# Execute the command - bk will print the URL and open browser
|
||||
"${CMD[@]}"
|
||||
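Restating the script's own usage text as a short hedged session (paths are relative to the repo root; the shadow pipeline name comes from the script's example):

# Dry-run (default): prints the `bk build create` command, triggers nothing.
bash .buildkite/scripts/trigger-ci-build.sh
# Actually trigger, against a different pipeline:
bash .buildkite/scripts/trigger-ci-build.sh --pipeline ci-shadow --execute
# Either way the commit must already be pushed, or the script exits with
# "Commit ... does not exist on any remote branch!".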
70
.buildkite/scripts/upload-release-wheels-pypi.sh
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
|
||||
|
||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||
|
||||
if [[ -z "$GIT_VERSION" ]]; then
|
||||
echo "[FATAL] Not on a git tag, cannot create release."
|
||||
exit 1
|
||||
else
|
||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||
fi
|
||||
# sanity check for version mismatch
|
||||
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
|
||||
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
|
||||
echo "[WARNING] Force release and ignore version mismatch"
|
||||
else
|
||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||
|
||||
# check pypi token
|
||||
if [[ -z "$PYPI_TOKEN" ]]; then
|
||||
echo "[FATAL] PYPI_TOKEN is not set."
|
||||
exit 1
|
||||
else
|
||||
export TWINE_USERNAME="__token__"
|
||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||
fi
|
||||
|
||||
set -x # enable command tracing only now, so the secrets exported above are not echoed
|
||||
|
||||
# install twine from pypi
|
||||
python3 -m venv /tmp/vllm-release-env
|
||||
source /tmp/vllm-release-env/bin/activate
|
||||
pip install twine
|
||||
python3 -m twine --version
|
||||
|
||||
# copy release wheels to local directory
|
||||
DIST_DIR=/tmp/vllm-release-dist
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
echo "Copying wheels to local directory"
|
||||
mkdir -p $DIST_DIR
|
||||
# include only wheels for the release version; ignore files with "dev" or "rc" in the name (the rc pattern requires a trailing digit so that 'aarch64' wheels are not excluded)
|
||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||
echo "Wheels copied to local directory"
|
||||
# generate source tarball
|
||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
||||
ls -la $DIST_DIR
|
||||
|
||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||
echo "No default variant wheels found, quitting..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -m twine check $PYPI_WHEEL_FILES
|
||||
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
|
||||
echo "Wheels uploaded to PyPI"
|
||||
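Only the default wheel variant (no '+' local-version suffix) is uploaded to PyPI; a hedged illustration of that filter with invented wheel filenames follows.

# Invented filenames to show what "-not -name '*+*'" keeps:
#   vllm-0.16.0-cp38-abi3-manylinux1_x86_64.whl        -> kept (default variant)
#   vllm-0.16.0+cu130-cp38-abi3-manylinux1_x86_64.whl  -> dropped (variant build)
find "$DIST_DIR" -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*"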
@@ -70,7 +70,9 @@ steps:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -81,7 +83,9 @@ steps:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -229,6 +233,7 @@ steps:
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
@@ -264,10 +269,16 @@ steps:
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
# NEW rlhf examples
|
||||
- pushd ../examples/offline_inference/new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
- popd
|
||||
|
||||
- label: Distributed Tests (8 GPUs) # 4min
|
||||
timeout_in_minutes: 10
|
||||
@@ -428,6 +439,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -452,10 +465,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
@@ -499,7 +514,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
@@ -519,6 +534,7 @@ steps:
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
- pytest -v -s cuda/test_platform_no_cuda_init.py
|
||||
|
||||
- label: Samplers Test # 56min
|
||||
timeout_in_minutes: 75
|
||||
@@ -598,9 +614,11 @@ steps:
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# # Limit to no custom ops to reduce running time
|
||||
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@@ -634,8 +652,9 @@ steps:
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||
- vllm/model_executor/layers/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
@@ -703,6 +722,17 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -724,7 +754,7 @@ steps:
|
||||
- label: Benchmarks # 11min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
@@ -735,7 +765,7 @@ steps:
|
||||
- label: Benchmarks CLI Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -834,10 +864,11 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_terratorch.py
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@@ -855,7 +886,7 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
@@ -1114,7 +1145,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -1160,44 +1191,26 @@ steps:
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/passes/test_fusion_attn.py
|
||||
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s tests/compile/passes/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
|
||||
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# # Wrap with quotes to escape yaml
|
||||
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1260,7 +1273,7 @@ steps:
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdmultinode]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -1274,15 +1287,15 @@ steps:
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
|
||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py
|
||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 68min
|
||||
@@ -1451,7 +1464,7 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
@@ -1462,10 +1475,10 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 15
|
||||
@@ -1476,7 +1489,7 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
@@ -1491,6 +1504,9 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
- export TORCH_NCCL_BLOCKING_WAIT=1
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
@@ -1541,12 +1557,15 @@ steps:
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
@@ -1662,17 +1681,6 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
|
||||
@@ -63,7 +63,9 @@ steps:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -74,7 +76,9 @@ steps:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -202,6 +206,7 @@ steps:
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
@@ -236,10 +241,16 @@ steps:
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
# NEW rlhf examples
|
||||
- pushd ../examples/offline_inference/new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
- popd
|
||||
|
||||
- label: Distributed Tests (8 GPUs) # 4min
|
||||
timeout_in_minutes: 10
|
||||
@@ -360,7 +371,7 @@ steps:
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
@@ -374,6 +385,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -396,10 +409,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
source_file_dependencies:
|
||||
@@ -438,7 +453,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
@@ -504,6 +519,7 @@ steps:
|
||||
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
- pytest -s -v compile/passes --ignore compile/passes/distributed
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@@ -531,9 +547,11 @@ steps:
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# # Limit to no custom ops to reduce running time
|
||||
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@@ -562,8 +580,9 @@ steps:
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||
- vllm/model_executor/layers/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
@@ -624,6 +643,56 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -736,10 +805,11 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_terratorch.py
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
timeout_in_minutes: 10
|
||||
@@ -951,7 +1021,7 @@ steps:
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 21 min
|
||||
- label: Blackwell Test # 23 min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@@ -961,7 +1031,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -991,6 +1061,8 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -1009,42 +1081,23 @@ steps:
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# # Wrap with quotes to escape yaml
|
||||
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1102,6 +1155,8 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
- pytest -v -s distributed/test_packed_tensor.py
|
||||
- pytest -v -s distributed/test_weight_transfer.py
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
timeout_in_minutes: 30
|
||||
@@ -1216,7 +1271,7 @@ steps:
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
|
||||
|
||||
- label: Pipeline + Context Parallelism Test # 45min
|
||||
timeout_in_minutes: 60
|
||||
@@ -1319,6 +1374,20 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Acceptance Length Test (Large Models) # optional
|
||||
timeout_in_minutes: 120
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 1
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/model_executor/models/mlp_speculator.py
|
||||
- tests/v1/spec_decode/test_acceptance_length.py
|
||||
commands:
|
||||
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
@@ -1344,22 +1413,31 @@ steps:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
gpu: h200
|
||||
- label: Sequence Parallel Tests (H100) # 60 min
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run sequence parallel tests
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
|
||||
|
||||
- label: Distributed Tests (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### H200 test #####
|
||||
|
||||
- label: LM Eval Large Models (H200) # optional
|
||||
timeout_in_minutes: 60
|
||||
gpu: h200
|
||||
|
||||
@@ -4,8 +4,10 @@ depends_on:
|
||||
steps:
|
||||
- label: V1 attention (H100)
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -13,9 +15,11 @@ steps:
|
||||
|
||||
- label: V1 attention (B200)
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
@@ -2,56 +2,202 @@ group: Compile
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Fusion and Compile Tests (B200)
|
||||
timeout_in_minutes: 40
|
||||
- label: Sequence Parallel Correctness Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/model_executor/layers/
|
||||
- vllm/compilation/
|
||||
- vllm/v1/worker/
|
||||
- vllm/v1/cudagraph_dispatcher.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
- tests/compile/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
- tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
|
||||
- label: Fusion E2E (2 GPUs)(B200)
|
||||
timeout_in_minutes: 40
|
||||
- label: Sequence Parallel Correctness Tests (2xH100)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
|
||||
- label: AsyncTP Correctness Tests (2xH100)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||
|
||||
- label: Distributed Compile Unit Tests (2xH100)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/model_executor/layers
|
||||
- tests/compile/passes/distributed/
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -s -v tests/compile/passes/distributed
|
||||
|
||||
- label: Fusion and Compile Unit Tests (B200)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/quantization/
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
|
||||
- tests/compile/passes/test_fusion_attn.py
|
||||
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
- tests/compile/fullgraph/test_full_graph.py
|
||||
commands:
|
||||
# b200 runners are limited, so we run only the minimal set of tests that are supported only on Blackwell
|
||||
- nvidia-smi
|
||||
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
|
||||
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||
# this runner has 2 GPUs available even though num_devices=2 is not set
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
# TODO(luka) move to H100 once pass tests run on H100
|
||||
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Fusion E2E Quick (H100)
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||
|
||||
- label: Fusion E2E Config Sweep (H100)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp8) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E Config Sweep (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||
# Run just llama3 (fp8 & fp4) for all config combinations
|
||||
# -k "llama-3"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 Quick (H100)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
|
||||
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: h100
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run just llama3 (fp8 & bf16) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
|
||||
|
||||
- label: Fusion E2E TP2 (B200)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
# for ar-rms-quant-fp4, also sweep llama3
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
|
||||
@@ -9,6 +9,7 @@ steps:
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
- pytest -v -s cuda/test_platform_no_cuda_init.py
|
||||
|
||||
- label: Cudagraph
|
||||
timeout_in_minutes: 20
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Distributed Comm Ops
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed
|
||||
- tests/distributed
|
||||
@@ -16,9 +16,9 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 90
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
@@ -47,14 +47,13 @@ steps:
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_utils
|
||||
@@ -63,6 +62,7 @@ steps:
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
@@ -97,14 +97,19 @@ steps:
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
# NEW rlhf examples
|
||||
- cd new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
device: h100
|
||||
num_devices: 8
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- examples/offline_inference/torchrun_dp_example.py
|
||||
@@ -120,9 +125,9 @@ steps:
|
||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
||||
|
||||
- label: Distributed Tests (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
device: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
@@ -133,26 +138,22 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(H200)
|
||||
gpu: h200
|
||||
- label: Distributed Tests (2 GPUs)(H100)
|
||||
timeout_in_minutes: 15
|
||||
device: h100
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
@@ -161,8 +162,9 @@ steps:
|
||||
- label: 2 Node Test (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
num_nodes: 2
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -171,12 +173,12 @@ steps:
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
|
||||
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
@@ -184,10 +186,21 @@ steps:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
@@ -196,4 +209,4 @@ steps:
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
@@ -4,27 +4,27 @@ depends_on:
|
||||
steps:
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
@@ -33,10 +33,11 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
soft_fail: true
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
@@ -23,4 +23,8 @@ steps:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
# Run this test standalone for now;
|
||||
# need to untangle the (implicit) use of spawn/fork across the tests.
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
# Run the rest of v1/engine tests
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
|
||||
@@ -14,7 +14,7 @@ steps:
|
||||
- label: EPLB Execution
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
|
||||
@@ -15,8 +15,9 @@ steps:
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
|
||||
- vllm/model_executor/layers/attention
|
||||
- tests/kernels/attention
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
@@ -57,8 +58,8 @@ steps:
|
||||
|
||||
- label: Kernels DeepGEMM Test (H100)
|
||||
timeout_in_minutes: 45
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
device: h100
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- tools/install_deepgemm.sh
|
||||
- vllm/utils/deep_gemm.py
|
||||
@@ -77,7 +78,7 @@ steps:
|
||||
- label: Kernels (B200)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
@@ -85,7 +86,7 @@ steps:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
@@ -114,4 +115,55 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
device: h100
|
||||
num_devices: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
@@ -12,9 +12,9 @@ steps:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
gpu: a100
|
||||
device: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -24,9 +24,9 @@ steps:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
gpu: h100
|
||||
device: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -37,10 +37,39 @@ steps:
|
||||
|
||||
- label: LM Eval Small Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
- label: LM Eval Large Models (H200)
|
||||
timeout_in_minutes: 60
|
||||
device: h200
|
||||
optional: true
|
||||
num_devices: 8
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
|
||||
device: h100
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
|
||||
|
||||
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
|
||||
|
||||
@@ -14,7 +14,7 @@ steps:
|
||||
|
||||
- label: LoRA TP (Distributed)
|
||||
timeout_in_minutes: 30
|
||||
num_gpus: 4
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
|
||||
@@ -16,7 +16,7 @@ steps:
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
@@ -27,11 +27,12 @@ steps:
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: V1 Others (CPU)
|
||||
depends_on: ~
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'cpu_test' v1/core
|
||||
@@ -71,7 +72,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
@@ -82,7 +83,7 @@ steps:
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/tracing
|
||||
@@ -114,24 +115,29 @@ steps:
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker, Config (CPU)
|
||||
depends_on: ~
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/test_pooling_params.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
- tests/transformers_utils
|
||||
- tests/config
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s test_pooling_params.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -140,7 +146,7 @@ steps:
|
||||
- label: GPT-OSS Eval (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@@ -153,7 +159,7 @@ steps:
|
||||
|
||||
- label: Batch Invariance (H100)
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
device: h100
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
@@ -162,4 +168,18 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
|
||||
- label: Acceptance Length Test (Large Models) # optional
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 1
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/model_executor/models/mlp_speculator.py
|
||||
- tests/v1/spec_decode/test_acceptance_length.py
|
||||
commands:
|
||||
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||
|
||||
@@ -33,18 +33,21 @@ steps:
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_terratorch.py
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_utils.py
|
||||
- tests/models/test_vision.py
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Distributed Model Tests (2 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||
- vllm/model_executor/models/
|
||||
|
||||
@@ -14,11 +14,13 @@ steps:
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
no_gpu: true
|
||||
device: cpu
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Plugin Tests (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
|
||||
@@ -3,7 +3,7 @@ depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
timeout_in_minutes: 30
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
@@ -17,8 +17,16 @@ steps:
|
||||
# (using -0 for proper path handling)
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Compilation Passes Unit Tests
|
||||
timeout_in_minutes: 20
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile/passes
|
||||
commands:
|
||||
- pytest -s -v compile/passes --ignore compile/passes/distributed
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test
|
||||
timeout_in_minutes: 30
|
||||
timeout_in_minutes: 35
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
@@ -30,16 +38,13 @@ steps:
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||
|
||||
- label: PyTorch Fullgraph
|
||||
timeout_in_minutes: 40
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
|
||||
@@ -16,14 +16,14 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Quantized MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
device: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
|
||||
@@ -5,7 +5,7 @@ steps:
|
||||
- label: Weight Loading Multiple GPU # 33min
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_devices: 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -15,8 +15,8 @@ steps:
|
||||
|
||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
num_devices: 2
|
||||
device: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
|
||||
.github/CODEOWNERS (vendored, 16 changes)
@@ -2,8 +2,8 @@
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# These lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
|
||||
/vllm/model_executor/layers/attention @LucasWilkinson
|
||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
@@ -16,7 +16,7 @@
|
||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
|
||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
@@ -30,12 +30,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1/attention/backends/mla @pavanimajety
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/offloading @ApostaC
|
||||
/vllm/v1/kv_offload @ApostaC @orozery
|
||||
/vllm/v1/worker/gpu/kv_connector.py @orozery
|
||||
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
|
||||
|
||||
# Model runner V2
|
||||
/vllm/v1/worker/gpu @WoosukKwon
|
||||
@@ -54,13 +56,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
/tests/lora @jeejeelee
|
||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
/tests/v1/kv_connector @ApostaC @orozery
|
||||
/tests/v1/kv_offload @ApostaC @orozery
|
||||
/tests/v1/determinism @yewentao256
|
||||
|
||||
# Transformers modeling backend
|
||||
|
||||
.github/mergify.yml (vendored, 12 changes)
@@ -414,6 +414,18 @@ pull_request_rules:
|
||||
remove:
|
||||
- needs-rebase
|
||||
|
||||
- name: label-bug
|
||||
description: Automatically apply bug label
|
||||
conditions:
|
||||
- label != stale
|
||||
- or:
|
||||
- title~=(?i)\bbug\b
|
||||
- title~=(?i)\bbugfix\b
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- bug
|
||||
|
||||
- name: label-kv-connector
|
||||
description: Automatically apply kv-connector label
|
||||
conditions:
|
||||
|
||||
.github/workflows/macos-smoke-test.yml (vendored, 3 changes)
@@ -29,8 +29,9 @@ jobs:
|
||||
|
||||
- name: Install dependencies and build vLLM
|
||||
run: |
|
||||
uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
|
||||
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
|
||||
uv pip install -e .
|
||||
uv pip install -e . --no-build-isolation
|
||||
env:
|
||||
CMAKE_BUILD_PARALLEL_LEVEL: 4
|
||||
|
||||
|
||||
.gitignore (vendored, 6 changes)
@@ -7,6 +7,9 @@ vllm/vllm_flash_attn/*
|
||||
# OpenAI triton kernels copied from source
|
||||
vllm/third_party/triton_kernels/*
|
||||
|
||||
# FlashMLA interface copied from source
|
||||
vllm/third_party/flashmla/flash_mla_interface.py
|
||||
|
||||
# triton jit
|
||||
.triton
|
||||
|
||||
@@ -191,6 +194,9 @@ CLAUDE.md
|
||||
AGENTS.md
|
||||
.codex/
|
||||
|
||||
# Cursor
|
||||
.cursor/
|
||||
|
||||
# DS Store
|
||||
.DS_Store
|
||||
|
||||
|
||||
@@ -121,24 +121,9 @@ repos:
|
||||
name: Update Dockerfile dependency graph
|
||||
entry: tools/pre_commit/update-dockerfile-graph.sh
|
||||
language: script
|
||||
- id: enforce-import-regex-instead-of-re
|
||||
name: Enforce import regex as re
|
||||
entry: python tools/pre_commit/enforce_regex_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# forbid directly import triton
|
||||
- id: forbid-direct-triton-import
|
||||
name: "Forbid direct 'import triton'"
|
||||
entry: python tools/pre_commit/check_triton_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
- id: check-pickle-imports
|
||||
name: Prevent new pickle/cloudpickle imports
|
||||
entry: python tools/pre_commit/check_pickle_imports.py
|
||||
- id: check-forbidden-imports
|
||||
name: Check for forbidden imports
|
||||
entry: python tools/pre_commit/check_forbidden_imports.py
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: [regex]
|
||||
@@ -147,6 +132,17 @@ repos:
|
||||
entry: python tools/pre_commit/validate_config.py
|
||||
language: python
|
||||
additional_dependencies: [regex]
|
||||
- id: validate-docker-versions
|
||||
name: Validate docker/versions.json matches Dockerfile
|
||||
entry: python tools/generate_versions_json.py --check
|
||||
language: python
|
||||
files: ^docker/(Dockerfile|versions\.json)$
|
||||
pass_filenames: false
|
||||
additional_dependencies: [dockerfile-parse]
|
||||
- id: attention-backend-docs
|
||||
name: Check attention backend documentation is up to date
|
||||
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
|
||||
language: python
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@@ -377,7 +377,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/marlin/generate_kernels.py)
|
||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
||||
@@ -412,7 +412,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_float16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
@@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_bfloat16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||
@@ -434,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
|
||||
@@ -446,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
|
||||
@@ -458,11 +458,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||
"csrc/quantization/marlin/marlin.cu"
|
||||
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/marlin/awq_marlin_repack.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_SRCS}"
|
||||
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
|
||||
|
||||
@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
|
||||
|
||||
## Usage
|
||||
|
||||
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
|
||||
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
|
||||
|
||||
For full CLI reference see:
|
||||
|
||||
|
||||
benchmarks/attention_benchmarks/README.md (new file, 266 lines)
@@ -0,0 +1,266 @@
# vLLM Attention Benchmarking Suite

Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.

## Quick Start

```bash
cd benchmarks/attention_benchmarks

# Run a pre-configured benchmark
python benchmark.py --config configs/mla_decode.yaml
python benchmark.py --config configs/mla_mixed_batch.yaml
python benchmark.py --config configs/speculative_decode.yaml
python benchmark.py --config configs/standard_attention.yaml
python benchmark.py --config configs/reorder_threshold.yaml

# Or run custom benchmarks
python benchmark.py \
    --backends flash flashinfer \
    --batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
    --output-csv results.csv
```

## Simplified Batch Specification Grammar
|
||||
|
||||
Express workloads concisely using query length and sequence length:
|
||||
|
||||
```python
|
||||
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
|
||||
"q1s1k" # Decode: 1 token with 1K sequence
|
||||
"8q1s1k" # 8 decode requests
|
||||
"q4s1k" # 4-token extend (e.g., spec decode)
|
||||
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
|
||||
"16q4s1k" # 16 spec decode (4 tokens each)
|
||||
```
|
||||
|
||||
### Grammar Rule
|
||||
|
||||
```text
|
||||
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
|
||||
- count: Number of identical requests (optional, default=1)
|
||||
- q_len: Query length (number of new tokens)
|
||||
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||
- 'k': Multiplies value by 1024
|
||||
|
||||
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
|
||||
```
|
||||
|
||||
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
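
A quick way to see how a spec expands is to run it through the suite's own parser; a minimal sketch, assuming it is run from `benchmarks/attention_benchmarks`:

```python
# Minimal sketch: expand a mixed-batch spec into (q_len, kv_len) pairs.
from batch_spec import parse_batch_spec

requests = parse_batch_spec("2q2k_32q1s1k")
print(requests[0].as_tuple(), requests[-1].as_tuple())  # (2048, 2048) (1, 1024)
print(sum(r.q_len for r in requests))  # 2 * 2048 + 32 * 1 = 4128 new tokens
```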
|
||||
|
||||
## Pre-configured Benchmarks
|
||||
|
||||
The suite includes several pre-configured YAML benchmark configurations:
|
||||
|
||||
### MLA Decode Benchmark
|
||||
|
||||
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/mla_decode.yaml
|
||||
```
|
||||
|
||||
### MLA Mixed Batch Benchmark
|
||||
|
||||
Tests chunked prefill performance with mixed prefill + decode batches.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/mla_mixed_batch.yaml
|
||||
```
|
||||
|
||||
### Speculative Decoding Benchmark
|
||||
|
||||
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/speculative_decode.yaml
|
||||
```
|
||||
|
||||
### Standard Attention Benchmark
|
||||
|
||||
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/standard_attention.yaml
|
||||
```
|
||||
|
||||
### Reorder Threshold Study
|
||||
|
||||
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
|
||||
|
||||
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
|
||||
|
||||
```bash
|
||||
python benchmark.py --config configs/reorder_threshold.yaml
|
||||
```
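
The selection rule itself is simple: the optimal threshold is the largest query length at which the decode pipeline still beats the prefill pipeline. A minimal sketch of that rule, with made-up timings:

```python
# Hedged sketch of the crossover rule; the timings are illustrative only.
timings = {1: (0.10, 0.18), 4: (0.12, 0.17), 16: (0.15, 0.16), 64: (0.30, 0.22)}
# qlen -> (decode_time_s, prefill_time_s)

optimal_threshold = max(
    (qlen for qlen, (dec, pre) in timings.items() if dec < pre),
    default=None,
)
print(optimal_threshold)  # 16 -> use the decode pipeline for query_length <= 16
```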
|
||||
|
||||
---
|
||||
|
||||
## Universal Benchmark
|
||||
|
||||
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
|
||||
|
||||
### Standard Attention (Flash/Triton/FlashInfer)
|
||||
|
||||
```bash
|
||||
python benchmark.py \
|
||||
--backends flash triton flashinfer \
|
||||
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
|
||||
--num-layers 10 \
|
||||
--repeats 5 \
|
||||
--output-csv results.csv
|
||||
```
|
||||
|
||||
### MLA Backends
|
||||
|
||||
```bash
|
||||
# Compare all MLA backends
|
||||
python benchmark.py \
|
||||
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
|
||||
--batch-specs "64q1s1k" "64q1s4k" \
|
||||
--output-csv mla_results.csv
|
||||
```
|
||||
|
||||
### Parameter Sweeps
|
||||
|
||||
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
|
||||
|
||||
#### CUTLASS MLA num-splits Optimization
|
||||
|
||||
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
|
||||
|
||||
```bash
|
||||
python benchmark.py \
|
||||
--backend cutlass_mla \
|
||||
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
|
||||
--sweep-param num_kv_splits \
|
||||
--sweep-values 1 2 4 8 16 \
|
||||
--output-json optimal_splits.json
|
||||
```
|
||||
|
||||
#### Reorder Batch Threshold Optimization
|
||||
|
||||
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
|
||||
|
||||
```bash
|
||||
python benchmark.py \
|
||||
--backend flashmla \
|
||||
--batch-specs "q4s1k" "q8s2k" \
|
||||
--sweep-param reorder_batch_threshold \
|
||||
--sweep-values 1 4 16 64 256 512 \
|
||||
--output-csv threshold_sweep.csv
|
||||
```
|
||||
|
||||
### All Command-Line Options
|
||||
|
||||
```text
|
||||
--config CONFIG # Path to YAML config file (overrides other args)
|
||||
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
|
||||
# flashinfer_mla, flashattn_mla, flashmla
|
||||
--backend BACKEND # Single backend (alternative to --backends)
|
||||
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
|
||||
|
||||
# Model configuration
|
||||
--num-layers N # Number of layers
|
||||
--head-dim N # Head dimension
|
||||
--num-q-heads N # Query heads
|
||||
--num-kv-heads N # KV heads
|
||||
--block-size N # Block size
|
||||
|
||||
# Benchmark settings
|
||||
--device DEVICE # Device (default: cuda:0)
|
||||
--repeats N # Repetitions
|
||||
--warmup-iters N # Warmup iterations
|
||||
--profile-memory # Profile memory usage
|
||||
|
||||
# Parameter sweeps
|
||||
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
|
||||
# reorder_batch_threshold)
|
||||
--sweep-values N [N ...] # Values to sweep for the parameter
|
||||
|
||||
# Output
|
||||
--output-csv FILE # Save to CSV
|
||||
--output-json FILE # Save to JSON
|
||||
```
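
The `--config` files are plain YAML whose structure mirrors the options above. A minimal sketch of the dict a config file should load into (key names taken from what `benchmark.py` reads in `load_config_from_yaml()`/`main()`; the values are illustrative, not a shipped config):

```python
# Hedged sketch: the structure yaml.safe_load() should yield for a --config file.
example_config = {
    "description": "Compare MLA backends on pure decode",
    "backends": ["flashattn_mla", "flashmla"],
    "batch_specs": ["64q1s1k", "64q1s4k"],
    "model": {
        "num_layers": 10,
        "head_dim": 576,
        "num_q_heads": 128,
        "num_kv_heads": 1,
        "block_size": 128,
    },
    "benchmark": {"device": "cuda:0", "repeats": 5, "warmup_iters": 3},
    "output": {"csv": "results.csv"},
}
```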
|
||||
|
||||
## Hardware Requirements
|
||||
|
||||
| Backend | Hardware |
|
||||
|---------|----------|
|
||||
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||
| CUTLASS MLA | Blackwell (SM100+) |
|
||||
| FlashAttn MLA | Hopper (SM90+) |
|
||||
| FlashMLA | Hopper (SM90+) |
|
||||
| FlashInfer-MLA | Any CUDA GPU |
|
||||
|
||||
## Using MLA Runner Directly
|
||||
|
||||
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
|
||||
|
||||
```python
|
||||
from mla_runner import run_mla_benchmark
|
||||
from common import BenchmarkConfig
|
||||
|
||||
config = BenchmarkConfig(
|
||||
backend="cutlass_mla",
|
||||
batch_spec="64q1s4k",
|
||||
num_layers=10,
|
||||
head_dim=576,
|
||||
num_q_heads=128,
|
||||
num_kv_heads=1,
|
||||
block_size=128,
|
||||
device="cuda:0",
|
||||
repeats=5,
|
||||
warmup_iters=3,
|
||||
)
|
||||
|
||||
# CUTLASS MLA with specific num_kv_splits
|
||||
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
|
||||
print(f"Time: {result.mean_time:.6f}s")
|
||||
|
||||
# FlashInfer-MLA
|
||||
result = run_mla_benchmark("flashinfer_mla", config)
|
||||
|
||||
# FlashAttn MLA (Hopper SM90+)
|
||||
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
|
||||
|
||||
# FlashMLA (Hopper SM90+)
|
||||
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
|
||||
```
|
||||
|
||||
## Python API
|
||||
|
||||
```python
|
||||
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
|
||||
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
|
||||
|
||||
# Parse batch specs
|
||||
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
|
||||
print(format_batch_spec(requests))
|
||||
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
|
||||
|
||||
# Get batch statistics
|
||||
stats = get_batch_stats(requests)
|
||||
print(f"Total tokens: {stats['total_tokens']}")
|
||||
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
|
||||
|
||||
# Format results
|
||||
formatter = ResultsFormatter()
|
||||
formatter.save_csv(results, "output.csv")
|
||||
formatter.save_json(results, "output.json")
|
||||
```
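
The remaining helpers exported by the package work the same way; a minimal sketch reusing the batch above (behavior as implemented in `batch_spec.py`):

```python
from batch_spec import parse_batch_spec, reorder_for_flashinfer, split_by_type

requests = parse_batch_spec("2q2k_32q1s1k")
reordered = reorder_for_flashinfer(requests)  # decodes first, then prefills
print(reordered[0].is_decode, reordered[-1].is_prefill)  # True True
print({k: len(v) for k, v in split_by_type(requests).items()})
# {'decode': 32, 'prefill': 2, 'extend': 0}
```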
|
||||
|
||||
## Tips
|
||||
|
||||
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
|
||||
|
||||
**2. Multiple repeats** - Use `--repeats 20` for low variance
|
||||
|
||||
**3. Save results** - Always use `--output-csv` or `--output-json`
|
||||
|
||||
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
|
||||
|
||||
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
|
||||
|
||||
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values
|
||||
44 benchmarks/attention_benchmarks/__init__.py Normal file
@@ -0,0 +1,44 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""vLLM Attention Benchmarking Suite."""
|
||||
|
||||
from .batch_spec import (
|
||||
BatchRequest,
|
||||
format_batch_spec,
|
||||
get_batch_stats,
|
||||
parse_batch_spec,
|
||||
reorder_for_flashinfer,
|
||||
split_by_type,
|
||||
)
|
||||
from .common import (
|
||||
BenchmarkConfig,
|
||||
BenchmarkResult,
|
||||
MockLayer,
|
||||
MockModelConfig,
|
||||
ResultsFormatter,
|
||||
get_attention_scale,
|
||||
is_mla_backend,
|
||||
setup_mla_dims,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Batch specification
|
||||
"BatchRequest",
|
||||
"parse_batch_spec",
|
||||
"format_batch_spec",
|
||||
"reorder_for_flashinfer",
|
||||
"split_by_type",
|
||||
"get_batch_stats",
|
||||
# Benchmarking infrastructure
|
||||
"BenchmarkConfig",
|
||||
"BenchmarkResult",
|
||||
"ResultsFormatter",
|
||||
# Mock objects
|
||||
"MockLayer",
|
||||
"MockModelConfig",
|
||||
# Utilities
|
||||
"setup_mla_dims",
|
||||
"get_attention_scale",
|
||||
"is_mla_backend",
|
||||
]
|
||||
231 benchmarks/attention_benchmarks/batch_spec.py Normal file
@@ -0,0 +1,231 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Simplified batch specification grammar for attention benchmarks.
|
||||
|
||||
Grammar (underscore-separated segments):
|
||||
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
|
||||
- count: Number of identical requests (optional, default=1)
|
||||
- q_len: Query length (number of new tokens)
|
||||
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
|
||||
- 'k' suffix: Multiplies value by 1024
|
||||
|
||||
Common patterns:
|
||||
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
|
||||
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
|
||||
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
|
||||
|
||||
Examples:
|
||||
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
|
||||
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
|
||||
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
|
||||
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
|
||||
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
|
||||
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
|
||||
"""
|
||||
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
@dataclass
|
||||
class BatchRequest:
|
||||
"""Represents a single request in a batch."""
|
||||
|
||||
q_len: int # Query length (number of new tokens)
|
||||
kv_len: int # Total KV cache length
|
||||
|
||||
@property
|
||||
def is_decode(self) -> bool:
|
||||
"""True if this is a decode request (q_len == 1)."""
|
||||
return self.q_len == 1
|
||||
|
||||
@property
|
||||
def is_prefill(self) -> bool:
|
||||
"""True if this is a pure prefill (q_len == kv_len)."""
|
||||
return self.q_len == self.kv_len
|
||||
|
||||
@property
|
||||
def is_extend(self) -> bool:
|
||||
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
|
||||
return self.q_len > 1 and self.kv_len > self.q_len
|
||||
|
||||
@property
|
||||
def context_len(self) -> int:
|
||||
"""Context length (KV cache - query)."""
|
||||
return self.kv_len - self.q_len
|
||||
|
||||
def as_tuple(self) -> tuple[int, int]:
|
||||
"""Return as (q_len, kv_len) tuple for compatibility."""
|
||||
return (self.q_len, self.kv_len)
|
||||
|
||||
|
||||
def _parse_size(size_str: str, k_suffix: str) -> int:
|
||||
"""Parse size string with optional 'k' suffix."""
|
||||
size = int(size_str)
|
||||
return size * 1024 if k_suffix == "k" else size
|
||||
|
||||
|
||||
def parse_batch_spec(spec: str) -> list[BatchRequest]:
|
||||
"""
|
||||
Parse batch specification string into list of BatchRequest objects.
|
||||
|
||||
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
|
||||
Args:
|
||||
spec: Batch specification string (see module docstring for grammar)
|
||||
|
||||
Returns:
|
||||
List of BatchRequest objects
|
||||
|
||||
Raises:
|
||||
ValueError: If spec format is invalid
|
||||
"""
|
||||
requests = []
|
||||
|
||||
for seg in spec.split("_"):
|
||||
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
|
||||
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
|
||||
if m:
|
||||
cnt = int(m.group(1)) if m.group(1) else 1
|
||||
q_len = _parse_size(m.group(2), m.group(3))
|
||||
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
|
||||
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
|
||||
continue
|
||||
|
||||
raise ValueError(f"Invalid batch spec segment: '{seg}'")
|
||||
|
||||
return requests
|
||||
|
||||
|
||||
def format_batch_spec(requests: list[BatchRequest]) -> str:
|
||||
"""
|
||||
Format list of BatchRequest into human-readable string.
|
||||
|
||||
Groups requests by type and provides counts and sizes.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest objects
|
||||
|
||||
Returns:
|
||||
Formatted string describing the batch
|
||||
"""
|
||||
kinds = {
|
||||
"prefill": [],
|
||||
"extend": [],
|
||||
"decode": [],
|
||||
}
|
||||
|
||||
for req in requests:
|
||||
tup = (req.q_len, req.kv_len)
|
||||
if req.is_prefill:
|
||||
kinds["prefill"].append(tup)
|
||||
elif req.is_extend:
|
||||
kinds["extend"].append(tup)
|
||||
elif req.is_decode:
|
||||
kinds["decode"].append(tup)
|
||||
|
||||
parts = []
|
||||
for kind in ["prefill", "extend", "decode"]:
|
||||
lst = kinds[kind]
|
||||
if not lst:
|
||||
continue
|
||||
|
||||
cnt_total = len(lst)
|
||||
ctr = Counter(lst)
|
||||
inner = []
|
||||
|
||||
for (q, kv), cnt in ctr.items():
|
||||
if kind == "prefill":
|
||||
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||
inner.append(f"{cnt}x{size}")
|
||||
elif kind == "decode":
|
||||
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||
inner.append(f"{cnt}x{size}")
|
||||
else: # extend
|
||||
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
|
||||
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
|
||||
inner.append(f"{cnt}xq{qstr}kv{kstr}")
|
||||
|
||||
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
|
||||
|
||||
return ", ".join(parts)
|
||||
|
||||
|
||||
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
|
||||
"""
|
||||
Reorder requests for FlashInfer: decode first, then prefill.
|
||||
|
||||
FlashInfer expects decode requests before prefill requests for
|
||||
optimal performance.
|
||||
|
||||
Args:
|
||||
requests: Original list of BatchRequest
|
||||
|
||||
Returns:
|
||||
Reordered list with decode requests first
|
||||
"""
|
||||
decodes = [r for r in requests if r.is_decode]
|
||||
non_decodes = [r for r in requests if not r.is_decode]
|
||||
return decodes + non_decodes
|
||||
|
||||
|
||||
def split_by_type(
|
||||
requests: list[BatchRequest],
|
||||
) -> dict[str, list[BatchRequest]]:
|
||||
"""
|
||||
Split requests by type for analysis.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest
|
||||
|
||||
Returns:
|
||||
Dict with keys: 'decode', 'prefill', 'extend'
|
||||
"""
|
||||
result = {
|
||||
"decode": [],
|
||||
"prefill": [],
|
||||
"extend": [],
|
||||
}
|
||||
|
||||
for req in requests:
|
||||
if req.is_decode:
|
||||
result["decode"].append(req)
|
||||
elif req.is_prefill:
|
||||
result["prefill"].append(req)
|
||||
elif req.is_extend:
|
||||
result["extend"].append(req)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_batch_stats(requests: list[BatchRequest]) -> dict:
|
||||
"""
|
||||
Compute statistics about a batch.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest
|
||||
|
||||
Returns:
|
||||
Dict with batch statistics
|
||||
"""
|
||||
by_type = split_by_type(requests)
|
||||
|
||||
return {
|
||||
"total_requests": len(requests),
|
||||
"num_decode": len(by_type["decode"]),
|
||||
"num_prefill": len(by_type["prefill"]),
|
||||
"num_extend": len(by_type["extend"]),
|
||||
"total_tokens": sum(r.q_len for r in requests),
|
||||
"total_kv_cache": sum(r.kv_len for r in requests),
|
||||
"max_q_len": max((r.q_len for r in requests), default=0),
|
||||
"max_kv_len": max((r.kv_len for r in requests), default=0),
|
||||
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
|
||||
"avg_kv_len": (
|
||||
sum(r.kv_len for r in requests) / len(requests) if requests else 0
|
||||
),
|
||||
}
|
||||
886 benchmarks/attention_benchmarks/benchmark.py Normal file
@@ -0,0 +1,886 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Universal vLLM Attention Benchmark
|
||||
|
||||
Benchmark any attention backend with the extended grammar.
|
||||
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
|
||||
|
||||
Examples:
|
||||
# Standard attention
|
||||
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
|
||||
|
||||
# MLA backends
|
||||
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
|
||||
|
||||
# Parameter sweep (CLI)
|
||||
python benchmark.py --backend cutlass_mla \
|
||||
--batch-specs "64q1s1k" \
|
||||
--sweep-param num_kv_splits \
|
||||
--sweep-values 1 4 8 16
|
||||
|
||||
# Parameter sweep (YAML config - recommended)
|
||||
python benchmark.py --config configs/cutlass_numsplits.yaml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from dataclasses import replace
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from rich.console import Console
|
||||
from tqdm import tqdm
|
||||
|
||||
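# Add the repository root (three directories up) to sys.path so a source
# checkout of vLLM is importable when this script is run directly.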
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from batch_spec import parse_batch_spec
|
||||
from common import (
|
||||
BenchmarkConfig,
|
||||
BenchmarkResult,
|
||||
ModelParameterSweep,
|
||||
ParameterSweep,
|
||||
ResultsFormatter,
|
||||
is_mla_backend,
|
||||
)
|
||||
|
||||
|
||||
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
||||
from runner import run_attention_benchmark
|
||||
|
||||
return run_attention_benchmark(config)
|
||||
|
||||
|
||||
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||
"""Run MLA benchmark with appropriate backend."""
|
||||
from mla_runner import run_mla_benchmark as run_mla
|
||||
|
||||
return run_mla(config.backend, config, **kwargs)
|
||||
|
||||
|
||||
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||
"""
|
||||
Run a single benchmark with proper backend selection.
|
||||
|
||||
Args:
|
||||
config: BenchmarkConfig with backend, batch_spec, and model params
|
||||
**kwargs: Additional arguments passed to MLA benchmarks
|
||||
|
||||
Returns:
|
||||
BenchmarkResult (may have error field set on failure)
|
||||
"""
|
||||
try:
|
||||
if is_mla_backend(config.backend):
|
||||
return run_mla_benchmark(config, **kwargs)
|
||||
else:
|
||||
return run_standard_attention_benchmark(config)
|
||||
except Exception as e:
|
||||
return BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=float("inf"),
|
||||
std_time=0,
|
||||
min_time=float("inf"),
|
||||
max_time=float("inf"),
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
|
||||
def run_model_parameter_sweep(
|
||||
backends: list[str],
|
||||
batch_specs: list[str],
|
||||
base_config_args: dict,
|
||||
sweep: ModelParameterSweep,
|
||||
console: Console,
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Run model parameter sweep for given backends and batch specs.
|
||||
|
||||
Args:
|
||||
backends: List of backend names
|
||||
batch_specs: List of batch specifications
|
||||
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||
sweep: ModelParameterSweep configuration
|
||||
console: Rich console for output
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
"""
|
||||
all_results = []
|
||||
|
||||
console.print(
|
||||
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
|
||||
)
|
||||
|
||||
total = len(backends) * len(batch_specs) * len(sweep.values)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for backend in backends:
|
||||
for spec in batch_specs:
|
||||
for value in sweep.values:
|
||||
# Create config with modified model parameter
|
||||
config_args = base_config_args.copy()
|
||||
config_args[sweep.param_name] = value
|
||||
|
||||
# Create config with original backend for running
|
||||
clean_config = BenchmarkConfig(
|
||||
backend=backend, batch_spec=spec, **config_args
|
||||
)
|
||||
|
||||
# Run benchmark
|
||||
result = run_benchmark(clean_config)
|
||||
|
||||
# Replace backend with labeled version for display
|
||||
backend_label = sweep.get_label(backend, value)
|
||||
labeled_config = replace(result.config, backend=backend_label)
|
||||
result = replace(result, config=labeled_config)
|
||||
all_results.append(result)
|
||||
|
||||
if not result.success:
|
||||
console.print(
|
||||
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||
f"{value}: {result.error}[/]"
|
||||
)
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display sweep results - create separate table for each parameter value
|
||||
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
|
||||
formatter = ResultsFormatter(console)
|
||||
|
||||
# Group results by parameter value and extract backend mapping
|
||||
by_param_value = {}
|
||||
backend_mapping = {} # Maps labeled backend -> original backend
|
||||
|
||||
for r in all_results:
|
||||
# Extract original backend and param value from labeled backend
|
||||
# The label format is: {backend}_{param_name}_{value}
|
||||
# We need to reverse engineer this
|
||||
labeled_backend = r.config.backend
|
||||
|
||||
# Try each backend to find which one this result belongs to
|
||||
for backend in backends:
|
||||
for value in sweep.values:
|
||||
expected_label = sweep.get_label(backend, value)
|
||||
if labeled_backend == expected_label:
|
||||
backend_mapping[labeled_backend] = backend
|
||||
param_value = str(value)
|
||||
|
||||
if param_value not in by_param_value:
|
||||
by_param_value[param_value] = []
|
||||
by_param_value[param_value].append(r)
|
||||
break
|
||||
|
||||
# Create a table for each parameter value
|
||||
sorted_param_values = sorted(
|
||||
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
|
||||
)
|
||||
|
||||
for param_value in sorted_param_values:
|
||||
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
|
||||
param_results = by_param_value[param_value]
|
||||
|
||||
# Create modified results with original backend names
|
||||
modified_results = []
|
||||
for r in param_results:
|
||||
# Get the original backend name from our mapping
|
||||
original_backend = backend_mapping[r.config.backend]
|
||||
modified_config = replace(r.config, backend=original_backend)
|
||||
modified_result = replace(r, config=modified_config)
|
||||
modified_results.append(modified_result)
|
||||
|
||||
# Print table with original backend names
|
||||
formatter.print_table(modified_results, backends, compare_to_fastest=True)
|
||||
|
||||
# Show optimal backend for each (param_value, batch_spec) combination
|
||||
console.print(
|
||||
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
|
||||
)
|
||||
|
||||
# Group by (param_value, batch_spec)
|
||||
by_param_and_spec = {}
|
||||
for r in all_results:
|
||||
if r.success:
|
||||
# Find which (backend, value) this result corresponds to
|
||||
labeled_backend = r.config.backend
|
||||
for backend in backends:
|
||||
for value in sweep.values:
|
||||
expected_label = sweep.get_label(backend, value)
|
||||
if labeled_backend == expected_label:
|
||||
param_value = str(value)
|
||||
spec = r.config.batch_spec
|
||||
key = (param_value, spec)
|
||||
|
||||
if key not in by_param_and_spec:
|
||||
by_param_and_spec[key] = []
|
||||
by_param_and_spec[key].append(r)
|
||||
break
|
||||
|
||||
# Sort by param value then spec
|
||||
sorted_keys = sorted(
|
||||
by_param_and_spec.keys(),
|
||||
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
|
||||
)
|
||||
|
||||
current_param_value = None
|
||||
for param_value, spec in sorted_keys:
|
||||
# Print header when param value changes
|
||||
if param_value != current_param_value:
|
||||
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
|
||||
current_param_value = param_value
|
||||
|
||||
results = by_param_and_spec[(param_value, spec)]
|
||||
best = min(results, key=lambda r: r.mean_time)
|
||||
|
||||
# Extract original backend name using the mapping
|
||||
backend_name = backend_mapping[best.config.backend]
|
||||
|
||||
# Show all backends' times for comparison
|
||||
times_str = " | ".join(
|
||||
[
|
||||
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
|
||||
for r in sorted(results, key=lambda r: r.mean_time)
|
||||
]
|
||||
)
|
||||
|
||||
console.print(
|
||||
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
|
||||
)
|
||||
|
||||
return all_results
|
||||
|
||||
|
||||
def run_parameter_sweep(
|
||||
backends: list[str],
|
||||
batch_specs: list[str],
|
||||
base_config_args: dict,
|
||||
sweep: ParameterSweep,
|
||||
console: Console,
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Run parameter sweep for given backends and batch specs.
|
||||
|
||||
Args:
|
||||
backends: List of backend names
|
||||
batch_specs: List of batch specifications
|
||||
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
|
||||
sweep: ParameterSweep configuration
|
||||
console: Rich console for output
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
"""
|
||||
all_results = []
|
||||
|
||||
# Build list of values to sweep (including auto if requested)
|
||||
sweep_values = list(sweep.values)
|
||||
if sweep.include_auto:
|
||||
sweep_values.append("auto")
|
||||
|
||||
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
|
||||
|
||||
total = len(backends) * len(batch_specs) * len(sweep_values)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for backend in backends:
|
||||
for spec in batch_specs:
|
||||
for value in sweep_values:
|
||||
# Create config with original backend for running
|
||||
config = BenchmarkConfig(
|
||||
backend=backend, batch_spec=spec, **base_config_args
|
||||
)
|
||||
|
||||
# Prepare kwargs for benchmark runner
|
||||
kwargs = {}
|
||||
if value != "auto":
|
||||
kwargs[sweep.param_name] = value
|
||||
|
||||
# Run benchmark
|
||||
result = run_benchmark(config, **kwargs)
|
||||
|
||||
# Replace backend with labeled version for display
|
||||
backend_label = sweep.get_label(backend, value)
|
||||
labeled_config = replace(result.config, backend=backend_label)
|
||||
result = replace(result, config=labeled_config)
|
||||
all_results.append(result)
|
||||
|
||||
if not result.success:
|
||||
console.print(
|
||||
f"[red]Error {backend} {spec} {sweep.param_name}="
|
||||
f"{value}: {result.error}[/]"
|
||||
)
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display sweep results
|
||||
console.print("\n[bold green]Sweep Results:[/]")
|
||||
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
|
||||
formatter = ResultsFormatter(console)
|
||||
formatter.print_table(all_results, backend_labels)
|
||||
|
||||
# Show optimal values
|
||||
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
|
||||
by_spec = {}
|
||||
for r in all_results:
|
||||
if r.success:
|
||||
spec = r.config.batch_spec
|
||||
if spec not in by_spec:
|
||||
by_spec[spec] = []
|
||||
by_spec[spec].append(r)
|
||||
|
||||
for spec in sorted(by_spec.keys()):
|
||||
results = by_spec[spec]
|
||||
best = min(results, key=lambda r: r.mean_time)
|
||||
console.print(
|
||||
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
|
||||
)
|
||||
|
||||
return all_results
|
||||
|
||||
|
||||
def load_config_from_yaml(config_path: str) -> dict:
|
||||
"""Load configuration from YAML file."""
|
||||
with open(config_path) as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
|
||||
"""
|
||||
Generate batch specs from range specifications.
|
||||
|
||||
Args:
|
||||
ranges: List of range specifications, each containing:
|
||||
- template: Batch spec template (e.g., "q{q_len}s1k")
|
||||
- q_len: Dict with start, stop, step, end_inclusive (optional)
|
||||
- Other parameters can also be ranges
|
||||
|
||||
Returns:
|
||||
List of generated batch spec strings
|
||||
|
||||
Example:
|
||||
ranges = [
|
||||
{
|
||||
"template": "q{q_len}kv1k",
|
||||
"q_len": {
|
||||
"start": 1,
|
||||
"stop": 16,
|
||||
"step": 1,
|
||||
"end_inclusive": true # Optional, defaults to true
|
||||
}
|
||||
}
|
||||
]
|
||||
Returns: ["q1s1k", "q2s1k", ..., "q16s1k"]
|
||||
"""
|
||||
all_specs = []
|
||||
|
||||
for range_spec in ranges:
|
||||
template = range_spec.get("template")
|
||||
if not template:
|
||||
raise ValueError("Range specification must include 'template'")
|
||||
|
||||
# Extract all range parameters from the spec
|
||||
range_params = {}
|
||||
for key, value in range_spec.items():
|
||||
if key == "template":
|
||||
continue
|
||||
if isinstance(value, dict) and "start" in value:
|
||||
# This is a range specification
|
||||
start = value["start"]
|
||||
stop = value["stop"]
|
||||
step = value.get("step", 1)
|
||||
# Check if end should be inclusive (default: True)
|
||||
end_inclusive = value.get("end_inclusive", True)
|
||||
|
||||
# Adjust stop based on end_inclusive
|
||||
if end_inclusive:
|
||||
range_params[key] = list(range(start, stop + 1, step))
|
||||
else:
|
||||
range_params[key] = list(range(start, stop, step))
|
||||
else:
|
||||
# This is a fixed value
|
||||
range_params[key] = [value]
|
||||
|
||||
# Generate all combinations (Cartesian product)
|
||||
if range_params:
|
||||
import itertools
|
||||
|
||||
param_names = list(range_params.keys())
|
||||
param_values = [range_params[name] for name in param_names]
|
||||
|
||||
for values in itertools.product(*param_values):
|
||||
params = dict(zip(param_names, values))
|
||||
spec = template.format(**params)
|
||||
all_specs.append(spec)
|
||||
else:
|
||||
# No parameters, just use template as-is
|
||||
all_specs.append(template)
|
||||
|
||||
return all_specs
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Universal vLLM attention benchmark",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog=__doc__,
|
||||
)
|
||||
|
||||
# Config file
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
help="Path to YAML config file (overrides other args)",
|
||||
)
|
||||
|
||||
# Backend selection
|
||||
parser.add_argument(
|
||||
"--backends",
|
||||
nargs="+",
|
||||
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
|
||||
"flashinfer_mla, flashattn_mla, flashmla)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
help="Single backend (alternative to --backends)",
|
||||
)
|
||||
|
||||
# Batch specifications
|
||||
parser.add_argument(
|
||||
"--batch-specs",
|
||||
nargs="+",
|
||||
default=["q2k", "8q1s1k"],
|
||||
help="Batch specifications using extended grammar",
|
||||
)
|
||||
|
||||
# Model config
|
||||
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
|
||||
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
|
||||
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
|
||||
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
|
||||
parser.add_argument("--block-size", type=int, default=16, help="Block size")
|
||||
|
||||
# Benchmark settings
|
||||
parser.add_argument("--device", default="cuda:0", help="Device")
|
||||
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
|
||||
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
|
||||
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
|
||||
|
||||
# Parameter sweep (use YAML config for advanced sweeps)
|
||||
parser.add_argument(
|
||||
"--sweep-param",
|
||||
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sweep-values",
|
||||
type=int,
|
||||
nargs="+",
|
||||
help="Values to sweep for the parameter",
|
||||
)
|
||||
|
||||
# Output
|
||||
parser.add_argument("--output-csv", help="Save to CSV")
|
||||
parser.add_argument("--output-json", help="Save to JSON")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
console = Console()
|
||||
console.print("[bold cyan]vLLM Attention Benchmark[/]")
|
||||
|
||||
# Load config from YAML if provided
|
||||
if args.config:
|
||||
console.print(f"[yellow]Loading config from: {args.config}[/]")
|
||||
yaml_config = load_config_from_yaml(args.config)
|
||||
|
||||
# Show description if available
|
||||
if "description" in yaml_config:
|
||||
console.print(f"[dim]{yaml_config['description']}[/]")
|
||||
|
||||
# Override args with YAML values
|
||||
# (YAML takes precedence unless CLI arg was explicitly set)
|
||||
# Backend(s)
|
||||
if "backend" in yaml_config:
|
||||
args.backend = yaml_config["backend"]
|
||||
args.backends = None
|
||||
elif "backends" in yaml_config:
|
||||
args.backends = yaml_config["backends"]
|
||||
args.backend = None
|
||||
|
||||
# Check for special modes
|
||||
if "mode" in yaml_config:
|
||||
args.mode = yaml_config["mode"]
|
||||
else:
|
||||
args.mode = None
|
||||
|
||||
# Batch specs and sizes
|
||||
# Support both explicit batch_specs and generated batch_spec_ranges
|
||||
if "batch_spec_ranges" in yaml_config:
|
||||
# Generate batch specs from ranges
|
||||
generated_specs = generate_batch_specs_from_ranges(
|
||||
yaml_config["batch_spec_ranges"]
|
||||
)
|
||||
# Combine with any explicit batch_specs
|
||||
if "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
||||
else:
|
||||
args.batch_specs = generated_specs
|
||||
console.print(
|
||||
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
||||
)
|
||||
elif "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"]
|
||||
|
||||
if "batch_sizes" in yaml_config:
|
||||
args.batch_sizes = yaml_config["batch_sizes"]
|
||||
else:
|
||||
args.batch_sizes = None
|
||||
|
||||
# Model config
|
||||
if "model" in yaml_config:
|
||||
model = yaml_config["model"]
|
||||
args.num_layers = model.get("num_layers", args.num_layers)
|
||||
args.head_dim = model.get("head_dim", args.head_dim)
|
||||
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
|
||||
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
|
||||
args.block_size = model.get("block_size", args.block_size)
|
||||
|
||||
# Benchmark settings
|
||||
if "benchmark" in yaml_config:
|
||||
bench = yaml_config["benchmark"]
|
||||
args.device = bench.get("device", args.device)
|
||||
args.repeats = bench.get("repeats", args.repeats)
|
||||
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
|
||||
args.profile_memory = bench.get("profile_memory", args.profile_memory)
|
||||
|
||||
# Parameter sweep configuration
|
||||
if "parameter_sweep" in yaml_config:
|
||||
sweep_config = yaml_config["parameter_sweep"]
|
||||
args.parameter_sweep = ParameterSweep(
|
||||
param_name=sweep_config["param_name"],
|
||||
values=sweep_config["values"],
|
||||
include_auto=sweep_config.get("include_auto", False),
|
||||
label_format=sweep_config.get(
|
||||
"label_format", "{backend}_{param_name}_{value}"
|
||||
),
|
||||
)
|
||||
else:
|
||||
args.parameter_sweep = None
|
||||
|
||||
# Model parameter sweep configuration
|
||||
if "model_parameter_sweep" in yaml_config:
|
||||
sweep_config = yaml_config["model_parameter_sweep"]
|
||||
args.model_parameter_sweep = ModelParameterSweep(
|
||||
param_name=sweep_config["param_name"],
|
||||
values=sweep_config["values"],
|
||||
label_format=sweep_config.get(
|
||||
"label_format", "{backend}_{param_name}_{value}"
|
||||
),
|
||||
)
|
||||
else:
|
||||
args.model_parameter_sweep = None
|
||||
|
||||
# Output
|
||||
if "output" in yaml_config:
|
||||
output = yaml_config["output"]
|
||||
if "csv" in output and not args.output_csv:
|
||||
args.output_csv = output["csv"]
|
||||
if "json" in output and not args.output_json:
|
||||
args.output_json = output["json"]
|
||||
|
||||
console.print()
|
||||
|
||||
# Handle CLI-based parameter sweep (if not from YAML)
|
||||
if (
|
||||
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
|
||||
and args.sweep_param
|
||||
and args.sweep_values
|
||||
):
|
||||
args.parameter_sweep = ParameterSweep(
|
||||
param_name=args.sweep_param,
|
||||
values=args.sweep_values,
|
||||
include_auto=False,
|
||||
label_format="{backend}_{param_name}_{value}",
|
||||
)
|
||||
|
||||
# Determine backends
|
||||
backends = args.backends or ([args.backend] if args.backend else ["flash"])
|
||||
console.print(f"Backends: {', '.join(backends)}")
|
||||
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
|
||||
console.print()
|
||||
|
||||
# Run benchmarks
|
||||
all_results = []
|
||||
|
||||
# Handle special mode: decode_vs_prefill comparison
|
||||
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
|
||||
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
|
||||
console.print(
|
||||
"[dim]For each query length, testing both decode and prefill pipelines[/]"
|
||||
)
|
||||
console.print("[dim]Using batched execution for optimal performance[/]")
|
||||
|
||||
# Extract batch sizes from config
|
||||
batch_sizes = getattr(args, "batch_sizes", [1])
|
||||
backend = backends[0] # Use first backend (should only be one)
|
||||
|
||||
# Calculate total benchmarks
|
||||
total = len(batch_sizes)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for batch_size in batch_sizes:
|
||||
# Prepare all configs for this batch size
|
||||
configs_with_thresholds = []
|
||||
|
||||
for spec in args.batch_specs:
|
||||
# Parse the batch spec to get query length
|
||||
requests = parse_batch_spec(spec)
|
||||
if not requests:
|
||||
console.print(
|
||||
f"[red]Error: Could not parse batch spec '{spec}'[/]"
|
||||
)
|
||||
continue
|
||||
|
||||
# Get query length from first request
|
||||
query_length = requests[0].q_len
|
||||
|
||||
# Create batch spec for this batch size
|
||||
# For batch_size > 1, we need to prepend the count
|
||||
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
|
||||
|
||||
# Create base config (without backend name)
|
||||
base_config = BenchmarkConfig(
|
||||
backend=backend, # Will be overridden later
|
||||
batch_spec=batch_spec,
|
||||
num_layers=args.num_layers,
|
||||
head_dim=args.head_dim,
|
||||
num_q_heads=args.num_q_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
block_size=args.block_size,
|
||||
device=args.device,
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
)
|
||||
|
||||
# Add decode pipeline config
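# (A threshold equal to the query length keeps these requests on the
# decode pipeline; query_length - 1, used for the prefill config
# below, forces the prefill pipeline instead.)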
|
||||
decode_threshold = query_length
|
||||
config_decode = replace(
|
||||
base_config,
|
||||
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
|
||||
)
|
||||
configs_with_thresholds.append((config_decode, decode_threshold))
|
||||
|
||||
# Add prefill pipeline config if query_length > 1
|
||||
if query_length > 1:
|
||||
prefill_threshold = query_length - 1
|
||||
config_prefill = replace(
|
||||
base_config,
|
||||
backend=f"{backend}_prefill_qlen{query_length}"
|
||||
f"_bs{batch_size}",
|
||||
)
|
||||
configs_with_thresholds.append(
|
||||
(config_prefill, prefill_threshold)
|
||||
)
|
||||
|
||||
# Run all benchmarks for this batch size in one go (batched mode)
|
||||
try:
|
||||
from mla_runner import run_mla_benchmark as run_mla
|
||||
|
||||
# Use batched API: pass list of (config, threshold) tuples
|
||||
timing_results = run_mla(backend, configs_with_thresholds)
|
||||
|
||||
# Create BenchmarkResult objects from timing results
|
||||
for (config, _), timing in zip(
|
||||
configs_with_thresholds, timing_results
|
||||
):
|
||||
result = BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=timing["mean"],
|
||||
std_time=timing["std"],
|
||||
min_time=timing["min"],
|
||||
max_time=timing["max"],
|
||||
throughput_tokens_per_sec=timing.get("throughput", None),
|
||||
)
|
||||
all_results.append(result)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
console.print(
|
||||
f"[red]Error running batched benchmarks for "
|
||||
f"batch_size={batch_size}: {e}[/]"
|
||||
)
|
||||
console.print("[red]Traceback:[/]")
|
||||
traceback.print_exc()
|
||||
# Add error results for all configs
|
||||
for config, _ in configs_with_thresholds:
|
||||
result = BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=float("inf"),
|
||||
std_time=0,
|
||||
min_time=float("inf"),
|
||||
max_time=float("inf"),
|
||||
error=str(e),
|
||||
)
|
||||
all_results.append(result)
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display decode vs prefill results
|
||||
console.print("\n[bold green]Decode vs Prefill Results:[/]")
|
||||
|
||||
# Group by batch size
|
||||
by_batch_size = {}
|
||||
for r in all_results:
|
||||
if r.success:
|
||||
# Extract batch size from backend name
|
||||
parts = r.config.backend.split("_")
|
||||
bs_part = [p for p in parts if p.startswith("bs")]
|
||||
if bs_part:
|
||||
bs = int(bs_part[0][2:])
|
||||
if bs not in by_batch_size:
|
||||
by_batch_size[bs] = []
|
||||
by_batch_size[bs].append(r)
|
||||
|
||||
# For each batch size, analyze crossover point
|
||||
for bs in sorted(by_batch_size.keys()):
|
||||
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
|
||||
results = by_batch_size[bs]
|
||||
|
||||
# Group by query length
|
||||
by_qlen = {}
|
||||
for r in results:
|
||||
parts = r.config.backend.split("_")
|
||||
qlen_part = [p for p in parts if p.startswith("qlen")]
|
||||
if qlen_part:
|
||||
qlen = int(qlen_part[0][4:])
|
||||
if qlen not in by_qlen:
|
||||
by_qlen[qlen] = {}
|
||||
|
||||
pipeline = "decode" if "decode" in r.config.backend else "prefill"
|
||||
by_qlen[qlen][pipeline] = r
|
||||
|
||||
# Find crossover point
|
||||
last_decode_faster = None
|
||||
for qlen in sorted(by_qlen.keys()):
|
||||
pipelines = by_qlen[qlen]
|
||||
if "decode" in pipelines and "prefill" in pipelines:
|
||||
decode_time = pipelines["decode"].mean_time
|
||||
prefill_time = pipelines["prefill"].mean_time
|
||||
faster = "decode" if decode_time < prefill_time else "prefill"
|
||||
|
||||
speedup = (
|
||||
prefill_time / decode_time
|
||||
if decode_time < prefill_time
|
||||
else decode_time / prefill_time
|
||||
)
|
||||
|
||||
console.print(
|
||||
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
|
||||
f"prefill={prefill_time:.6f}s -> "
|
||||
f"[bold]{faster}[/] ({speedup:.2f}x)"
|
||||
)
|
||||
|
||||
if faster == "decode":
|
||||
last_decode_faster = qlen
|
||||
|
||||
if last_decode_faster is not None:
|
||||
optimal_threshold = last_decode_faster
|
||||
console.print(
|
||||
f"\n [bold green]Optimal threshold for batch_size={bs}: "
|
||||
f"{optimal_threshold}[/]"
|
||||
)
|
||||
console.print(
|
||||
f" [dim](Use decode pipeline for query_length <= "
|
||||
f"{optimal_threshold})[/]"
|
||||
)
|
||||
else:
|
||||
console.print(
|
||||
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
|
||||
)
|
||||
|
||||
# Handle model parameter sweep mode
|
||||
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
|
||||
# Model parameter sweep
|
||||
base_config_args = {
|
||||
"num_layers": args.num_layers,
|
||||
"head_dim": args.head_dim,
|
||||
"num_q_heads": args.num_q_heads,
|
||||
"num_kv_heads": args.num_kv_heads,
|
||||
"block_size": args.block_size,
|
||||
"device": args.device,
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
}
|
||||
all_results = run_model_parameter_sweep(
|
||||
backends,
|
||||
args.batch_specs,
|
||||
base_config_args,
|
||||
args.model_parameter_sweep,
|
||||
console,
|
||||
)
|
||||
|
||||
# Handle parameter sweep mode (unified)
|
||||
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
|
||||
# Unified parameter sweep
|
||||
base_config_args = {
|
||||
"num_layers": args.num_layers,
|
||||
"head_dim": args.head_dim,
|
||||
"num_q_heads": args.num_q_heads,
|
||||
"num_kv_heads": args.num_kv_heads,
|
||||
"block_size": args.block_size,
|
||||
"device": args.device,
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
}
|
||||
all_results = run_parameter_sweep(
|
||||
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
|
||||
)
|
||||
|
||||
else:
|
||||
# Normal mode: compare backends
|
||||
total = len(backends) * len(args.batch_specs)
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for spec in args.batch_specs:
|
||||
for backend in backends:
|
||||
config = BenchmarkConfig(
|
||||
backend=backend,
|
||||
batch_spec=spec,
|
||||
num_layers=args.num_layers,
|
||||
head_dim=args.head_dim,
|
||||
num_q_heads=args.num_q_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
block_size=args.block_size,
|
||||
device=args.device,
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
)
|
||||
|
||||
result = run_benchmark(config)
|
||||
all_results.append(result)
|
||||
|
||||
if not result.success:
|
||||
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
# Display results
|
||||
console.print("\n[bold green]Results:[/]")
|
||||
formatter = ResultsFormatter(console)
|
||||
formatter.print_table(all_results, backends)
|
||||
|
||||
# Save results
|
||||
if all_results:
|
||||
formatter = ResultsFormatter(console)
|
||||
if args.output_csv:
|
||||
formatter.save_csv(all_results, args.output_csv)
|
||||
if args.output_json:
|
||||
formatter.save_json(all_results, args.output_json)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
503 benchmarks/attention_benchmarks/common.py Normal file
@@ -0,0 +1,503 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""Common utilities for attention benchmarking."""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import math
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
# Mock classes for vLLM attention infrastructure
|
||||
|
||||
|
||||
class MockHfConfig:
|
||||
"""Mock HuggingFace config that satisfies vLLM's requirements."""
|
||||
|
||||
def __init__(self, mla_dims: dict):
|
||||
self.num_attention_heads = mla_dims["num_q_heads"]
|
||||
self.num_key_value_heads = mla_dims["num_kv_heads"]
|
||||
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
|
||||
self.model_type = "deepseek_v2"
|
||||
self.is_encoder_decoder = False
|
||||
self.kv_lora_rank = mla_dims["kv_lora_rank"]
|
||||
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
|
||||
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
|
||||
self.v_head_dim = mla_dims["v_head_dim"]
|
||||
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
|
||||
|
||||
def get_text_config(self):
|
||||
return self
|
||||
|
||||
|
||||
# Import AttentionLayerBase at module level to avoid circular dependencies
|
||||
try:
|
||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
|
||||
_HAS_ATTENTION_LAYER_BASE = True
|
||||
except ImportError:
|
||||
_HAS_ATTENTION_LAYER_BASE = False
|
||||
AttentionLayerBase = object # Fallback
|
||||
|
||||
|
||||
class MockKVBProj:
|
||||
"""Mock KV projection layer for MLA prefill mode.
|
||||
|
||||
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
|
||||
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
|
||||
"""
|
||||
|
||||
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
|
||||
self.num_heads = num_heads
|
||||
self.qk_nope_head_dim = qk_nope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
self.out_dim = qk_nope_head_dim + v_head_dim
|
||||
|
||||
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
"""
|
||||
Project kv_c_normed to output space.
|
||||
|
||||
Args:
|
||||
x: Input tensor [num_tokens, kv_lora_rank]
|
||||
|
||||
Returns:
|
||||
Tuple containing output tensor
|
||||
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
|
||||
"""
|
||||
num_tokens = x.shape[0]
|
||||
result = torch.randn(
|
||||
num_tokens,
|
||||
self.num_heads,
|
||||
self.out_dim,
|
||||
device=x.device,
|
||||
dtype=x.dtype,
|
||||
)
|
||||
return (result,) # Return as tuple to match ColumnParallelLinear API
|
||||
|
||||
|
||||
class MockLayer(AttentionLayerBase):
|
||||
"""Mock attention layer with scale parameters and impl.
|
||||
|
||||
Inherits from AttentionLayerBase so it passes isinstance checks
|
||||
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
|
||||
"""
|
||||
|
||||
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
|
||||
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
|
||||
self._k_scale = torch.tensor(1.0, device=device)
|
||||
self._v_scale = torch.tensor(1.0, device=device)
|
||||
self._q_scale = torch.tensor(1.0, device=device)
|
||||
# Scalar floats for kernels that need them
|
||||
self._k_scale_float = float(self._k_scale.item())
|
||||
self._v_scale_float = float(self._v_scale.item())
|
||||
self._q_scale_float = float(self._q_scale.item())
|
||||
# AttentionImpl for metadata builders to query
|
||||
self.impl = impl
|
||||
# KV cache spec for get_kv_cache_spec
|
||||
self._kv_cache_spec = kv_cache_spec
|
||||
|
||||
def get_attn_backend(self):
|
||||
"""Get the attention backend class (required by AttentionLayerBase)."""
|
||||
# Return None as this is just a mock layer for benchmarking
|
||||
return None
|
||||
|
||||
def get_kv_cache_spec(self):
|
||||
"""Get the KV cache spec (required by AttentionLayerBase)."""
|
||||
return self._kv_cache_spec
|
||||
|
||||
|
||||
class MockModelConfig:
|
||||
"""Mock model configuration."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_q_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
dtype: torch.dtype = torch.float16,
|
||||
max_model_len: int = 32768,
|
||||
):
|
||||
self._n_q = num_q_heads
|
||||
self._n_kv = num_kv_heads
|
||||
self._d = head_dim
|
||||
self.dtype = dtype
|
||||
self.max_model_len = max_model_len
|
||||
|
||||
def get_num_attention_heads(self, _=None) -> int:
|
||||
return self._n_q
|
||||
|
||||
def get_num_kv_heads(self, _=None) -> int:
|
||||
return self._n_kv
|
||||
|
||||
def get_head_size(self) -> int:
|
||||
return self._d
|
||||
|
||||
def get_num_layers(self) -> int:
|
||||
"""Mock method for layer count queries."""
|
||||
return 1
|
||||
|
||||
def get_sliding_window_for_layer(self, _layer_idx: int):
|
||||
"""Mock method for sliding window queries."""
|
||||
return None
|
||||
|
||||
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
|
||||
"""Mock method for logits soft cap queries."""
|
||||
return None
|
||||
|
||||
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
|
||||
"""Mock method for SM scale queries."""
|
||||
return 1.0 / (self.get_head_size() ** 0.5)
|
||||
|
||||
|
||||
class MockParallelConfig:
|
||||
"""Mock parallel configuration."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class MockCompilationConfig:
|
||||
"""Mock compilation configuration."""
|
||||
|
||||
def __init__(self):
|
||||
self.full_cuda_graph = False
|
||||
self.static_forward_context = {}
|
||||
|
||||
|
||||
class MockVLLMConfig:
|
||||
"""Mock VLLM configuration."""
|
||||
|
||||
def __init__(self):
|
||||
self.compilation_config = MockCompilationConfig()
|
||||
|
||||
|
||||
class MockRunner:
|
||||
"""Mock GPU runner for metadata builders."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
seq_lens: np.ndarray,
|
||||
query_start_locs: np.ndarray,
|
||||
device: torch.device,
|
||||
num_q_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
|
||||
self.parallel_config = MockParallelConfig()
|
||||
self.vllm_config = MockVLLMConfig()
|
||||
self.seq_lens_np = seq_lens
|
||||
self.query_start_loc_np = query_start_locs
|
||||
self.device = device
|
||||
self.attention_chunk_size = None
|
||||
self.num_query_heads = num_q_heads
|
||||
self.num_kv_heads = num_kv_heads
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParameterSweep:
|
||||
"""Configuration for sweeping a backend parameter."""
|
||||
|
||||
param_name: str # Name of the backend parameter to sweep
|
||||
values: list[Any] # List of values to test
|
||||
include_auto: bool = False # Also test with param unset (auto mode)
|
||||
label_format: str = "{backend}_{param_name}_{value}" # Result label template
|
||||
|
||||
def get_label(self, backend: str, value: Any) -> str:
|
||||
"""Generate a label for a specific parameter value."""
|
||||
return self.label_format.format(
|
||||
backend=backend, param_name=self.param_name, value=value
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelParameterSweep:
|
||||
"""Configuration for sweeping a model configuration parameter."""
|
||||
|
||||
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
|
||||
values: list[Any] # List of values to test
|
||||
label_format: str = "{backend}_{param_name}_{value}" # Result label template
|
||||
|
||||
def get_label(self, backend: str, value: Any) -> str:
|
||||
"""Generate a label for a specific parameter value."""
|
||||
return self.label_format.format(
|
||||
backend=backend, param_name=self.param_name, value=value
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkConfig:
|
||||
"""Configuration for a single benchmark run."""
|
||||
|
||||
backend: str
|
||||
batch_spec: str
|
||||
num_layers: int
|
||||
head_dim: int
|
||||
num_q_heads: int
|
||||
num_kv_heads: int
|
||||
block_size: int
|
||||
device: str
|
||||
dtype: torch.dtype = torch.float16
|
||||
repeats: int = 1
|
||||
warmup_iters: int = 3
|
||||
profile_memory: bool = False
|
||||
use_cuda_graphs: bool = False
|
||||
|
||||
# MLA-specific
|
||||
kv_lora_rank: int | None = None
|
||||
qk_nope_head_dim: int | None = None
|
||||
qk_rope_head_dim: int | None = None
|
||||
v_head_dim: int | None = None
|
||||
|
||||
# Backend-specific tuning
|
||||
num_kv_splits: int | None = None # CUTLASS MLA
|
||||
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkResult:
|
||||
"""Results from a single benchmark run."""
|
||||
|
||||
config: BenchmarkConfig
|
||||
mean_time: float # seconds
|
||||
std_time: float # seconds
|
||||
min_time: float # seconds
|
||||
max_time: float # seconds
|
||||
throughput_tokens_per_sec: float | None = None
|
||||
memory_allocated_mb: float | None = None
|
||||
memory_reserved_mb: float | None = None
|
||||
error: str | None = None
|
||||
|
||||
@property
|
||||
def success(self) -> bool:
|
||||
"""Whether benchmark completed successfully."""
|
||||
return self.error is None
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Convert to dictionary for serialization."""
|
||||
return {
|
||||
"config": asdict(self.config),
|
||||
"mean_time": self.mean_time,
|
||||
"std_time": self.std_time,
|
||||
"min_time": self.min_time,
|
||||
"max_time": self.max_time,
|
||||
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
|
||||
"memory_allocated_mb": self.memory_allocated_mb,
|
||||
"memory_reserved_mb": self.memory_reserved_mb,
|
||||
"error": self.error,
|
||||
}
|
||||
|
||||
|
||||
class ResultsFormatter:
|
||||
"""Format and display benchmark results."""
|
||||
|
||||
def __init__(self, console: Console | None = None):
|
||||
self.console = console or Console()
|
||||
|
||||
def print_table(
|
||||
self,
|
||||
results: list[BenchmarkResult],
|
||||
backends: list[str],
|
||||
compare_to_fastest: bool = True,
|
||||
):
|
||||
"""
|
||||
Print results as a rich table.
|
||||
|
||||
Args:
|
||||
results: List of BenchmarkResult
|
||||
backends: List of backend names being compared
|
||||
compare_to_fastest: Show percentage comparison to fastest
|
||||
"""
|
||||
# Group by batch spec
|
||||
by_spec = {}
|
||||
for r in results:
|
||||
spec = r.config.batch_spec
|
||||
if spec not in by_spec:
|
||||
by_spec[spec] = {}
|
||||
by_spec[spec][r.config.backend] = r
|
||||
|
||||
# Create shortened backend names for display
|
||||
def shorten_backend_name(name: str) -> str:
|
||||
"""Shorten long backend names for table display."""
|
||||
# Remove common prefixes
|
||||
name = name.replace("flashattn_mla", "famla")
|
||||
name = name.replace("flashinfer_mla", "fimla")
|
||||
name = name.replace("flashmla", "fmla")
|
||||
name = name.replace("cutlass_mla", "cmla")
|
||||
name = name.replace("numsplits", "ns")
|
||||
return name
|
||||
|
||||
table = Table(title="Attention Benchmark Results")
|
||||
table.add_column("Batch\nSpec", no_wrap=True)
|
||||
|
||||
multi = len(backends) > 1
|
||||
for backend in backends:
|
||||
short_name = shorten_backend_name(backend)
|
||||
# Time column
|
||||
col_time = f"{short_name}\nTime (s)"
|
||||
table.add_column(col_time, justify="right", no_wrap=False)
|
||||
if multi and compare_to_fastest:
|
||||
# Relative performance column
|
||||
col_rel = f"{short_name}\nvs Best"
|
||||
table.add_column(col_rel, justify="right", no_wrap=False)
|
||||
|
||||
# Add rows
|
||||
for spec in sorted(by_spec.keys()):
|
||||
spec_results = by_spec[spec]
|
||||
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
|
||||
best_time = min(times.values()) if times else 0.0
|
||||
|
||||
row = [spec]
|
||||
for backend in backends:
|
||||
if backend in spec_results:
|
||||
r = spec_results[backend]
|
||||
if r.success:
|
||||
row.append(f"{r.mean_time:.6f}")
|
||||
if multi and compare_to_fastest:
|
||||
pct = (
|
||||
(r.mean_time / best_time * 100) if best_time > 0 else 0
|
||||
)
|
||||
pct_str = f"{pct:.1f}%"
|
||||
if r.mean_time == best_time:
|
||||
pct_str = f"[bold green]{pct_str}[/]"
|
||||
row.append(pct_str)
|
||||
else:
|
||||
row.append("[red]ERROR[/]")
|
||||
if multi and compare_to_fastest:
|
||||
row.append("-")
|
||||
else:
|
||||
row.append("-")
|
||||
if multi and compare_to_fastest:
|
||||
row.append("-")
|
||||
|
||||
table.add_row(*row)
|
||||
|
||||
self.console.print(table)
|
||||
|
||||
def save_csv(self, results: list[BenchmarkResult], path: str):
|
||||
"""Save results to CSV file."""
|
||||
if not results:
|
||||
return
|
||||
|
||||
path_obj = Path(path)
|
||||
path_obj.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(path, "w", newline="") as f:
|
||||
writer = csv.DictWriter(
|
||||
f,
|
||||
fieldnames=[
|
||||
"backend",
|
||||
"batch_spec",
|
||||
"num_layers",
|
||||
"mean_time",
|
||||
"std_time",
|
||||
"throughput",
|
||||
"memory_mb",
|
||||
],
|
||||
)
|
||||
writer.writeheader()
|
||||
for r in results:
|
||||
writer.writerow(
|
||||
{
|
||||
"backend": r.config.backend,
|
||||
"batch_spec": r.config.batch_spec,
|
||||
"num_layers": r.config.num_layers,
|
||||
"mean_time": r.mean_time,
|
||||
"std_time": r.std_time,
|
||||
"throughput": r.throughput_tokens_per_sec or 0,
|
||||
"memory_mb": r.memory_allocated_mb or 0,
|
||||
}
|
||||
)
|
||||
|
||||
self.console.print(f"[green]Saved CSV results to {path}[/]")
|
||||
|
||||
def save_json(self, results: list[BenchmarkResult], path: str):
|
||||
"""Save results to JSON file."""
|
||||
path_obj = Path(path)
|
||||
path_obj.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
data = [r.to_dict() for r in results]
|
||||
with open(path, "w") as f:
|
||||
json.dump(data, f, indent=2, default=str)
|
||||
|
||||
self.console.print(f"[green]Saved JSON results to {path}[/]")
|
||||
|
||||
|
||||
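Note: a minimal sketch of driving the formatter, assuming `results` comes from one of the runner modules added later in this diff (output paths are illustrative):

# Illustrative only: print and persist a list of BenchmarkResult objects.
def report(results: list[BenchmarkResult]) -> None:
    formatter = ResultsFormatter()
    formatter.print_table(results, backends=["cutlass_mla", "flashinfer_mla"])
    formatter.save_csv(results, "out/mla_decode.csv")
    formatter.save_json(results, "out/mla_decode.json")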
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
|
||||
"""
|
||||
Get MLA dimensions for known models.
|
||||
|
||||
Args:
|
||||
model_name: Model identifier
|
||||
|
||||
Returns:
|
||||
Dict with MLA dimension configuration
|
||||
"""
|
||||
configs = {
|
||||
"deepseek-v2": {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": 128,
|
||||
"num_kv_heads": 1,
|
||||
"head_dim": 576,
|
||||
},
|
||||
"deepseek-v3": {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": 128,
|
||||
"num_kv_heads": 1,
|
||||
"head_dim": 576,
|
||||
},
|
||||
"deepseek-v2-lite": {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": 16,
|
||||
"num_kv_heads": 1,
|
||||
"head_dim": 576,
|
||||
},
|
||||
}
|
||||
|
||||
if model_name not in configs:
|
||||
raise ValueError(
|
||||
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
|
||||
)
|
||||
|
||||
return configs[model_name]
|
||||
|
||||
|
||||
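Note: the returned dimensions are internally consistent; a quick, purely illustrative sanity check makes the relationships explicit:

# Illustrative sanity check of the DeepSeek MLA dimensions above.
dims = setup_mla_dims("deepseek-v3")
# Cached latent entry = compressed KV (512) + rotary key part (64) = 576.
assert dims["head_dim"] == dims["kv_lora_rank"] + dims["qk_rope_head_dim"]
# Per-head query dim used on the prefill path: 128 + 64 = 192.
print(dims["qk_nope_head_dim"] + dims["qk_rope_head_dim"])  # 192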
def get_attention_scale(head_dim: int) -> float:
|
||||
"""Compute attention scale factor (1/sqrt(d))."""
|
||||
return 1.0 / math.sqrt(head_dim)
|
||||
|
||||
|
||||
def is_mla_backend(backend: str) -> bool:
|
||||
"""
|
||||
Check if backend is an MLA backend using the backend's is_mla() property.
|
||||
|
||||
Args:
|
||||
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
|
||||
|
||||
Returns:
|
||||
True if the backend is an MLA backend, False otherwise
|
||||
"""
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
try:
|
||||
backend_class = AttentionBackendEnum[backend.upper()].get_class()
|
||||
return backend_class.is_mla()
|
||||
except (KeyError, ValueError, ImportError):
|
||||
return False
|
||||
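Note: brief illustrative calls for the two helpers above; whether an MLA backend resolves to True depends on what is importable in the local build:

# Illustrative only; the MLA check depends on what is importable locally.
print(get_attention_scale(192))          # ~0.0722, i.e. 1 / sqrt(192)
print(is_mla_backend("cutlass_mla"))     # True if the CUTLASS MLA class imports
print(is_mla_backend("flash_attn"))      # False for standard attention backends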
benchmarks/attention_benchmarks/configs/mla_decode.yaml (new file, 61 lines)
@@ -0,0 +1,61 @@
|
||||
# MLA decode-only benchmark configuration
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1 # MLA uses single latent KV
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||
|
||||
batch_specs:
|
||||
# Small batches, varying sequence lengths
|
||||
- "16q1s512" # 16 requests, 512 KV cache
|
||||
- "16q1s1k" # 16 requests, 1k KV cache
|
||||
- "16q1s2k" # 16 requests, 2k KV cache
|
||||
- "16q1s4k" # 16 requests, 4k KV cache
|
||||
|
||||
# Medium batches
|
||||
- "32q1s1k" # 32 requests, 1k KV cache
|
||||
- "32q1s2k" # 32 requests, 2k KV cache
|
||||
- "32q1s4k" # 32 requests, 4k KV cache
|
||||
- "32q1s8k" # 32 requests, 8k KV cache
|
||||
|
||||
# Large batches
|
||||
- "64q1s1k" # 64 requests, 1k KV cache
|
||||
- "64q1s2k" # 64 requests, 2k KV cache
|
||||
- "64q1s4k" # 64 requests, 4k KV cache
|
||||
- "64q1s8k" # 64 requests, 8k KV cache
|
||||
|
||||
# Very large batches
|
||||
- "128q1s1k" # 128 requests, 1k KV cache
|
||||
- "128q1s2k" # 128 requests, 2k KV cache
|
||||
|
||||
# Long context
|
||||
- "32q1s16k" # 32 requests, 16k KV cache
|
||||
- "32q1s32k" # 32 requests, 32k KV cache
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
|
||||
# Backend-specific tuning
|
||||
cutlass_mla:
|
||||
num_kv_splits: auto # or specific value like 4, 8, 16
|
||||
|
||||
flashattn_mla:
|
||||
reorder_batch_threshold: 512
|
||||
|
||||
flashmla:
|
||||
reorder_batch_threshold: 1
|
||||
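Note: the batch_spec strings above follow the pattern documented in the inline comments (optional request count, q<query_len>, s<kv_len>, groups joined by "_"). The sketch below is a rough reading of the simple forms only; the parse_batch_spec module added in this PR is the authoritative parser:

import re

def parse_simple_spec(spec: str) -> list[tuple[int, int]]:
    """Rough reading of the common spec forms used in these configs.

    Returns one (q_len, kv_len) pair per request. Handles only the
    "<count>q<q_len>s<kv_len>" groups (e.g. "16q1s2k") and pure prefill
    "q<q_len>" (e.g. "q8k"); the real parse_batch_spec also covers
    KV-extension specs and other mixes.
    """
    def tok(v: str) -> int:
        return int(v[:-1]) * 1024 if v.endswith("k") else int(v)

    requests: list[tuple[int, int]] = []
    for group in spec.split("_"):
        m = re.fullmatch(r"(\d*)q(\d+k?)(?:s(\d+k?))?", group)
        if m is None:
            raise ValueError(f"unsupported group: {group}")
        count = int(m.group(1) or 1)
        q_len = tok(m.group(2))
        kv_len = tok(m.group(3)) if m.group(3) else q_len
        requests.extend([(q_len, kv_len)] * count)
    return requests

print(parse_simple_spec("2q2k_16q1s1k"))  # 2 prefills of 2048 tokens + 16 decodes over 1k KV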
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
# MLA mixed batch benchmark (prefill + decode)
|
||||
# Tests chunked prefill performance
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128
|
||||
|
||||
batch_specs:
|
||||
# Small prefill + decode
|
||||
- "1q1k_8q1s1k" # 1 prefill + 8 decode
|
||||
- "2q2k_16q1s1k" # 2 prefill + 16 decode
|
||||
- "4q1k_32q1s2k" # 4 prefill + 32 decode
|
||||
|
||||
# Medium prefill + decode
|
||||
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
|
||||
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
|
||||
- "8q2k_64q1s4k" # 8 prefill + 64 decode
|
||||
|
||||
# Large prefill + decode (chunked prefill stress test)
|
||||
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
|
||||
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
|
||||
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
|
||||
|
||||
# Context extension + decode
|
||||
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
|
||||
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
|
||||
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
|
||||
|
||||
# Explicitly chunked prefill
|
||||
- "q8k" # 8k prefill with chunking hint
|
||||
- "q16k" # 16k prefill with chunking hint
|
||||
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
|
||||
|
||||
# High decode ratio (realistic serving)
|
||||
- "1q2k_63q1s1k" # 1 prefill + 63 decode
|
||||
- "2q2k_62q1s2k" # 2 prefill + 62 decode
|
||||
- "4q4k_60q1s4k" # 4 prefill + 60 decode
|
||||
|
||||
backends:
|
||||
- cutlass_mla
|
||||
- flashinfer_mla
|
||||
- flashattn_mla # Hopper only
|
||||
- flashmla # Hopper only
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
|
||||
# Analyze chunked prefill workspace size impact
|
||||
chunked_prefill:
|
||||
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
|
||||
@@ -0,0 +1,88 @@
|
||||
# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
# Question: at what query length does the prefill pipeline become faster than the decode pipeline?
# Methodology: for each query length, compare decode vs. prefill performance to find the crossover point.
# Applies to: FlashAttn MLA, FlashMLA
|
||||
|
||||
description: "Decode vs Prefill pipeline crossover analysis"
|
||||
|
||||
# Test FlashAttn MLA
|
||||
backend: flashattn_mla
|
||||
|
||||
# Mode: decode_vs_prefill comparison (special sweep mode)
|
||||
# For each batch spec, we'll test both decode and prefill pipelines
|
||||
mode: "decode_vs_prefill"
|
||||
|
||||
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
|
||||
# Each query length will be tested with BOTH decode and prefill pipelines:
|
||||
# - decode: threshold >= query_length (forces decode pipeline)
|
||||
# - prefill: threshold < query_length (forces prefill pipeline)
|
||||
#
|
||||
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
|
||||
# This tests different query lengths with fixed sequence length context
|
||||
#
|
||||
# Using batch_spec_ranges for automatic generation:
|
||||
batch_spec_ranges:
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 1
|
||||
stop: 16
|
||||
step: 1
|
||||
end_inclusive: false
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 16
|
||||
stop: 64
|
||||
step: 2
|
||||
end_inclusive: false
|
||||
- template: "q{q_len}s1k"
|
||||
q_len:
|
||||
start: 64
|
||||
stop: 1024
|
||||
step: 4
|
||||
end_inclusive: true
|
||||
|
||||
# Batch sizes to test (from old script)
|
||||
batch_sizes:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 8
|
||||
- 16
|
||||
- 32
|
||||
- 64
|
||||
- 128
|
||||
- 256
|
||||
|
||||
# Model configuration (DeepSeek V2/V3 defaults)
|
||||
model:
|
||||
num_layers: 10
|
||||
head_dim: 576
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
block_size: 128
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 15 # Extra repeats to reduce timing variance across query lengths
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Output
|
||||
output:
|
||||
csv: "reorder_threshold_results.csv"
|
||||
json: "reorder_threshold_results.json"
|
||||
|
||||
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
|
||||
# - For each batch size, find the crossover point where prefill becomes faster than decode
|
||||
# - Show decode vs prefill performance across all query lengths
|
||||
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
|
||||
# - Understand how crossover point varies with batch size
|
||||
# - Provide data-driven guidance for default threshold value
|
||||
#
|
||||
# Methodology (from old script):
|
||||
# - Each query length tested with BOTH pipelines:
|
||||
# * decode: threshold >= query_length (forces decode pipeline)
|
||||
# * prefill: threshold < query_length (forces prefill pipeline)
|
||||
# - Compare which is faster to find crossover point
|
||||
#
|
||||
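Note: the methodology above reduces to a simple post-processing step over paired timings; a hedged sketch (the timing-dict shape is an assumption, not something dictated by this config):

def find_crossover(timings: dict[int, tuple[float, float]]) -> int:
    """timings maps query_length -> (decode_time_s, prefill_time_s).

    Returns a suggested reorder_batch_threshold: the largest tested query
    length at which the decode pipeline is still at least as fast as the
    prefill pipeline.
    """
    threshold = 1  # q_len == 1 always goes through the decode pipeline
    for q_len in sorted(timings):
        decode_t, prefill_t = timings[q_len]
        if decode_t <= prefill_t:
            threshold = q_len
    return threshold

# Made-up numbers: decode wins up to q_len 16, prefill wins from 32 onwards.
print(find_crossover({1: (0.10, 0.30), 8: (0.12, 0.20), 16: (0.15, 0.16), 32: (0.40, 0.22)}))  # -> 16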
@@ -0,0 +1,62 @@
|
||||
# Speculative decoding benchmark configuration
|
||||
# Tests reorder_batch_threshold optimization
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
|
||||
batch_specs:
|
||||
# Pure speculative decode (K-token verification)
|
||||
- "q2s1k" # 2-token spec, 1k KV
|
||||
- "q4s1k" # 4-token spec, 1k KV
|
||||
- "q8s1k" # 8-token spec, 1k KV
|
||||
- "q16s1k" # 16-token spec, 1k KV
|
||||
|
||||
# Speculative with different context lengths
|
||||
- "q4s2k" # 4-token spec, 2k KV
|
||||
- "q4s4k" # 4-token spec, 4k KV
|
||||
- "q8s2k" # 8-token spec, 2k KV
|
||||
- "q8s4k" # 8-token spec, 4k KV
|
||||
|
||||
# Mixed: speculative + regular decode
|
||||
- "32q4s1k" # 32 spec requests
|
||||
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
|
||||
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
|
||||
|
||||
# Mixed: speculative + prefill + decode
|
||||
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
|
||||
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
|
||||
|
||||
# Large batches with speculation
|
||||
- "64q4s1k" # 64 spec requests
|
||||
- "32q8s2k" # 32 spec (8-token)
|
||||
- "16q16s4k" # 16 spec (16-token)
|
||||
|
||||
# Backends that support query length > 1
|
||||
backends:
|
||||
- flashattn_mla # reorder_batch_threshold = 512
|
||||
- flashmla # reorder_batch_threshold = 1 (tunable)
|
||||
|
||||
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
|
||||
# - flashinfer_mla
|
||||
|
||||
# Benchmark settings
|
||||
benchmark:
|
||||
device: "cuda:0"
|
||||
repeats: 10 # More repeats for statistical significance
|
||||
warmup_iters: 5
|
||||
profile_memory: false
|
||||
|
||||
# Test these threshold values for optimization
|
||||
parameter_sweep:
|
||||
param_name: "reorder_batch_threshold"
|
||||
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
|
||||
include_auto: false
|
||||
label_format: "{backend}_threshold_{value}"
|
||||
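Note: the parameter_sweep block maps directly onto the sweep dataclass added in common.py in this diff; a minimal loading sketch (the file path and the use of PyYAML are assumptions):

import yaml  # PyYAML, assumed available in the benchmark environment

# The path below is hypothetical; the file name is not shown in this diff.
with open("benchmarks/attention_benchmarks/configs/mla_spec_decode.yaml") as f:
    cfg = yaml.safe_load(f)

sweep = cfg["parameter_sweep"]  # field names match the dataclass in common.py
labels = [
    sweep["label_format"].format(backend=b, param_name=sweep["param_name"], value=v)
    for b in cfg["backends"]
    for v in sweep["values"]
]
print(labels[0])  # "flashattn_mla_threshold_1"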
@@ -0,0 +1,40 @@
|
||||
# Standard attention backend benchmark configuration
|
||||
|
||||
model:
|
||||
num_layers: 32
|
||||
num_q_heads: 32
|
||||
num_kv_heads: 8 # GQA with 4:1 ratio
|
||||
head_dim: 128
|
||||
block_size: 16
|
||||
|
||||
batch_specs:
|
||||
# Pure prefill
|
||||
- "q512" # Small prefill (512 tokens)
|
||||
- "q2k" # Medium prefill (2048 tokens)
|
||||
- "q4k" # Large prefill (4096 tokens)
|
||||
- "q8k" # Very large prefill (8192 tokens)
|
||||
|
||||
# Pure decode
|
||||
- "8q1s1k" # 8 requests, 1k KV cache each
|
||||
- "16q1s2k" # 16 requests, 2k KV cache each
|
||||
- "32q1s1k" # 32 requests, 1k KV cache each
|
||||
- "64q1s4k" # 64 requests, 4k KV cache each
|
||||
|
||||
# Mixed prefill/decode
|
||||
- "2q2k_8q1s1k" # 2 prefill + 8 decode
|
||||
- "4q1k_16q1s2k" # 4 prefill + 16 decode
|
||||
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
|
||||
|
||||
# Context extension
|
||||
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
|
||||
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
|
||||
|
||||
backends:
|
||||
- flash
|
||||
- triton
|
||||
- flashinfer
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 5
|
||||
warmup_iters: 3
|
||||
profile_memory: false
|
||||
benchmarks/attention_benchmarks/mla_runner.py (new file, 836 lines)
@@ -0,0 +1,836 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
MLA benchmark runner - shared utilities for MLA benchmarks.
|
||||
|
||||
This module provides helpers for running MLA backends without
|
||||
needing full VllmConfig integration.
|
||||
"""
|
||||
|
||||
import importlib
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from batch_spec import parse_batch_spec
|
||||
from common import (
|
||||
BenchmarkResult,
|
||||
MockHfConfig,
|
||||
MockKVBProj,
|
||||
MockLayer,
|
||||
setup_mla_dims,
|
||||
)
|
||||
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
CompilationConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# VllmConfig Creation
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
|
||||
"""
|
||||
Add mock methods for layer-specific queries to ModelConfig.
|
||||
|
||||
These methods are needed by metadata builders but aren't normally
|
||||
present on ModelConfig when used in benchmark contexts.
|
||||
"""
|
||||
import types
|
||||
|
||||
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
|
||||
model_config.get_sliding_window_for_layer = types.MethodType(
|
||||
lambda self, _i: None, model_config
|
||||
)
|
||||
model_config.get_logits_soft_cap_for_layer = types.MethodType(
|
||||
lambda self, _i: None, model_config
|
||||
)
|
||||
model_config.get_sm_scale_for_layer = types.MethodType(
|
||||
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
|
||||
)
|
||||
|
||||
|
||||
def create_minimal_vllm_config(
|
||||
model_name: str = "deepseek-v3",
|
||||
block_size: int = 128,
|
||||
max_num_seqs: int = 256,
|
||||
mla_dims: dict | None = None,
|
||||
) -> VllmConfig:
|
||||
"""
|
||||
Create minimal VllmConfig for MLA benchmarks.
|
||||
|
||||
Args:
|
||||
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
|
||||
provided
|
||||
block_size: KV cache block size
|
||||
max_num_seqs: Maximum number of sequences
|
||||
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
|
||||
setup_mla_dims(model_name)
|
||||
|
||||
Returns:
|
||||
VllmConfig for benchmarking
|
||||
"""
|
||||
# Get MLA dimensions - use provided or load from model name
|
||||
if mla_dims is None:
|
||||
mla_dims = setup_mla_dims(model_name)
|
||||
|
||||
# Create mock HF config first (avoids downloading from HuggingFace)
|
||||
mock_hf_config = MockHfConfig(mla_dims)
|
||||
|
||||
# Create a temporary minimal config.json to avoid HF downloads
|
||||
# This ensures consistent ModelConfig construction without network access
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
minimal_config = {
|
||||
"architectures": ["DeepseekV2ForCausalLM"],
|
||||
"model_type": "deepseek_v2",
|
||||
"num_attention_heads": mla_dims["num_q_heads"],
|
||||
"num_key_value_heads": mla_dims["num_kv_heads"],
|
||||
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
|
||||
"torch_dtype": "bfloat16",
|
||||
"max_position_embeddings": 163840, # DeepSeek V3 default
|
||||
"rope_theta": 10000.0,
|
||||
"vocab_size": 128256,
|
||||
}
|
||||
|
||||
# Create temporary directory with config.json
|
||||
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
|
||||
config_path = os.path.join(temp_dir, "config.json")
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(minimal_config, f)
|
||||
|
||||
try:
|
||||
# Create model config using local path - no HF downloads
|
||||
model_config = ModelConfig(
|
||||
model=temp_dir, # Use local temp directory
|
||||
tokenizer=None,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
seed=0,
|
||||
max_model_len=32768,
|
||||
quantization=None,
|
||||
quantization_param_path=None,
|
||||
enforce_eager=False,
|
||||
max_context_len_to_capture=None,
|
||||
max_seq_len_to_capture=8192,
|
||||
max_logprobs=20,
|
||||
disable_sliding_window=False,
|
||||
skip_tokenizer_init=True,
|
||||
served_model_name=None,
|
||||
limit_mm_per_prompt=None,
|
||||
use_async_output_proc=True,
|
||||
config_format="auto",
|
||||
)
|
||||
finally:
|
||||
# Clean up temporary directory
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
# Override with our mock config
|
||||
model_config.hf_config = mock_hf_config
|
||||
model_config.hf_text_config = mock_hf_config
|
||||
|
||||
# Add mock methods for layer-specific queries
|
||||
_add_mock_methods_to_model_config(model_config)
|
||||
|
||||
# Create sub-configs
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=32768,
|
||||
is_encoder_decoder=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
parallel_config = ParallelConfig(
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
compilation_config = CompilationConfig()
|
||||
|
||||
return VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
scheduler_config=scheduler_config,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
|
||||
|
||||
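Note: a minimal sketch of how this helper is consumed; it mirrors what _run_mla_benchmark_batched below does:

# Illustrative only: build a config and make it current before constructing
# any backend objects, since the metadata builders read the active config.
vllm_config = create_minimal_vllm_config(model_name="deepseek-v3", block_size=128)
with set_current_vllm_config(vllm_config):
    head_size = vllm_config.model_config.get_head_size()  # 576 for DeepSeek MLA dims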
# ============================================================================
|
||||
# Backend Configuration
|
||||
# ============================================================================
|
||||
|
||||
|
||||
# Backend name to class name prefix mapping
|
||||
_BACKEND_NAME_MAP = {
|
||||
"flashattn_mla": "FlashAttnMLA",
|
||||
"flashmla": "FlashMLA",
|
||||
"flashinfer_mla": "FlashInferMLA",
|
||||
"cutlass_mla": "CutlassMLA",
|
||||
}
|
||||
|
||||
# Special properties that differ from defaults
|
||||
_BACKEND_PROPERTIES = {
|
||||
"flashmla": {
|
||||
"query_format": "concat", # Single concatenated tensor (vs tuple)
|
||||
"block_size": 64, # FlashMLA uses fixed block size
|
||||
},
|
||||
"flashinfer_mla": {
|
||||
"block_size": 64, # FlashInfer MLA only supports 32 or 64
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _get_backend_config(backend: str) -> dict:
|
||||
"""
|
||||
Get backend configuration using naming conventions.
|
||||
|
||||
All MLA backends follow the pattern:
|
||||
- Module: vllm.v1.attention.backends.mla.{backend}
|
||||
- Impl: {Name}Impl
|
||||
- Metadata: {Name}Metadata (or MLACommonMetadata)
|
||||
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
|
||||
- MetadataBuilder: {Name}MetadataBuilder
|
||||
"""
|
||||
if backend not in _BACKEND_NAME_MAP:
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
name = _BACKEND_NAME_MAP[backend]
|
||||
props = _BACKEND_PROPERTIES.get(backend, {})
|
||||
|
||||
# Check if backend uses common metadata (FlashInfer, CUTLASS)
|
||||
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
|
||||
|
||||
return {
|
||||
"module": f"vllm.v1.attention.backends.mla.{backend}",
|
||||
"impl_class": f"{name}Impl",
|
||||
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
|
||||
"decode_metadata_class": "MLACommonDecodeMetadata"
|
||||
if uses_common
|
||||
else f"{name}DecodeMetadata",
|
||||
"builder_class": f"{name}MetadataBuilder",
|
||||
"query_format": props.get("query_format", "tuple"),
|
||||
"block_size": props.get("block_size", None),
|
||||
}
|
||||
|
||||
|
||||
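Note: as an illustration, the convention-derived config for flashattn_mla resolves to the following (derived from the two tables above, nothing new):

# Illustrative: what _get_backend_config("flashattn_mla") resolves to.
cfg = _get_backend_config("flashattn_mla")
assert cfg["module"] == "vllm.v1.attention.backends.mla.flashattn_mla"
assert cfg["impl_class"] == "FlashAttnMLAImpl"
assert cfg["metadata_class"] == "FlashAttnMLAMetadata"       # not a "common metadata" backend
assert cfg["builder_class"] == "FlashAttnMLAMetadataBuilder"
assert cfg["query_format"] == "tuple" and cfg["block_size"] is None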
# ============================================================================
|
||||
# Metadata Building Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _build_attention_metadata(
|
||||
requests: list,
|
||||
block_size: int,
|
||||
device: torch.device,
|
||||
builder_instance,
|
||||
) -> tuple:
|
||||
"""
|
||||
Build attention metadata from batch requests.
|
||||
|
||||
Args:
|
||||
requests: List of BatchRequest objects
|
||||
block_size: KV cache block size
|
||||
device: Target device
|
||||
builder_instance: Metadata builder instance
|
||||
|
||||
Returns:
|
||||
Tuple of (metadata, kv_cache_num_blocks)
|
||||
"""
|
||||
q_lens = [r.q_len for r in requests]
|
||||
kv_lens = [r.kv_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
max_kv = max(kv_lens)
|
||||
|
||||
# Build query start locations
|
||||
q_start_cpu = torch.tensor(
|
||||
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
|
||||
dtype=torch.int32,
|
||||
)
|
||||
q_start_gpu = q_start_cpu.to(device)
|
||||
|
||||
# Build sequence lengths
|
||||
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
|
||||
seq_lens_gpu = seq_lens_cpu.to(device)
|
||||
|
||||
# Build num_computed_tokens (context length for each request)
|
||||
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
|
||||
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
|
||||
|
||||
# Build block table
|
||||
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
|
||||
max_num_blocks = max(num_blocks_per_req)
|
||||
|
||||
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
|
||||
current_block = 0
|
||||
for i, num_blocks in enumerate(num_blocks_per_req):
|
||||
for j in range(num_blocks):
|
||||
block_table_cpu[i, j] = current_block
|
||||
current_block += 1
|
||||
|
||||
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
|
||||
|
||||
# Build slot mapping
|
||||
slot_mapping_list = []
|
||||
for i, (q_len, kv_len, num_blocks) in enumerate(
|
||||
zip(q_lens, kv_lens, num_blocks_per_req)
|
||||
):
|
||||
context_len = kv_len - q_len
|
||||
for j in range(q_len):
|
||||
token_kv_idx = context_len + j
|
||||
block_idx = token_kv_idx // block_size
|
||||
offset_in_block = token_kv_idx % block_size
|
||||
global_block_id = block_table_cpu[i, block_idx]
|
||||
slot_id = global_block_id * block_size + offset_in_block
|
||||
slot_mapping_list.append(slot_id)
|
||||
|
||||
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
|
||||
|
||||
# Create CommonAttentionMetadata
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
|
||||
common_attn_metadata = CommonAttentionMetadata(
|
||||
num_reqs=len(requests),
|
||||
max_query_len=max(q_lens),
|
||||
max_seq_len=max_kv,
|
||||
num_actual_tokens=total_q,
|
||||
query_start_loc=q_start_gpu,
|
||||
query_start_loc_cpu=q_start_cpu,
|
||||
seq_lens=seq_lens_gpu,
|
||||
_seq_lens_cpu=seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
slot_mapping=slot_mapping,
|
||||
block_table_tensor=block_table_gpu,
|
||||
dcp_local_seq_lens=None,
|
||||
)
|
||||
|
||||
# Use the production build() method
|
||||
metadata = builder_instance.build(
|
||||
common_prefix_len=0,
|
||||
common_attn_metadata=common_attn_metadata,
|
||||
fast_build=False,
|
||||
)
|
||||
|
||||
return metadata, current_block
|
||||
|
||||
|
||||
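Note: to make the slot-mapping arithmetic above concrete, a tiny hand-worked case with block_size 4 instead of 128 (illustrative only):

# Illustrative only: one request with 5 context tokens already cached and
# q_len = 3 new tokens, block_size = 4. The new tokens land at KV
# positions 5, 6 and 7.
block_size = 4
block_table_row = [10, 11]  # physical blocks assigned to this request
for token_kv_idx in range(5, 8):
    block_idx = token_kv_idx // block_size            # 1, 1, 1
    offset_in_block = token_kv_idx % block_size       # 1, 2, 3
    slot_id = block_table_row[block_idx] * block_size + offset_in_block
    print(slot_id)                                    # 45, 46, 47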
def _create_input_tensors(
|
||||
total_q: int,
|
||||
mla_dims: dict,
|
||||
query_format: str,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
"""
|
||||
Create input tensors for both decode and prefill modes.
|
||||
|
||||
MLA requires different tensor formats for decode vs prefill:
|
||||
- Decode: Uses kv_lora_rank (512) dimension
|
||||
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
|
||||
|
||||
Args:
|
||||
total_q: Total number of query tokens
|
||||
mla_dims: MLA dimension configuration
|
||||
query_format: Either "tuple" or "concat"
|
||||
device: Target device
|
||||
dtype: Tensor dtype
|
||||
|
||||
Returns:
|
||||
Tuple of (decode_inputs, prefill_inputs)
|
||||
- decode_inputs: Query tensor(s) for decode mode
|
||||
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill
|
||||
"""
|
||||
if query_format == "tuple":
|
||||
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
|
||||
q_nope_decode = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["kv_lora_rank"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
q_pe = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
decode_inputs = (q_nope_decode, q_pe)
|
||||
|
||||
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
|
||||
q_nope_prefill = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["qk_nope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
|
||||
else: # concat
|
||||
decode_inputs = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
# For prefill with concat format
|
||||
prefill_q = torch.randn(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"],
|
||||
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
# Create additional inputs needed for prefill forward
|
||||
k_c_normed = torch.randn(
|
||||
total_q,
|
||||
mla_dims["kv_lora_rank"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
k_pe = torch.randn(
|
||||
total_q,
|
||||
1, # Single head for MLA
|
||||
mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
k_scale = torch.ones(1, device=device, dtype=torch.float32)
|
||||
|
||||
output = torch.zeros(
|
||||
total_q,
|
||||
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
prefill_inputs = {
|
||||
"q": prefill_q,
|
||||
"k_c_normed": k_c_normed,
|
||||
"k_pe": k_pe,
|
||||
"k_scale": k_scale,
|
||||
"output": output,
|
||||
}
|
||||
|
||||
return decode_inputs, prefill_inputs
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Backend Initialization
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _create_backend_impl(
|
||||
backend_cfg: dict,
|
||||
mla_dims: dict,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
):
|
||||
"""
|
||||
Create backend implementation instance.
|
||||
|
||||
Args:
|
||||
backend_cfg: Backend configuration dict
|
||||
mla_dims: MLA dimension configuration
|
||||
vllm_config: VllmConfig instance
|
||||
device: Target device
|
||||
|
||||
Returns:
|
||||
Tuple of (impl, layer, builder_instance)
|
||||
"""
|
||||
# Import backend classes
|
||||
backend_module = importlib.import_module(backend_cfg["module"])
|
||||
impl_class = getattr(backend_module, backend_cfg["impl_class"])
|
||||
|
||||
# Calculate scale
|
||||
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
|
||||
|
||||
# Create mock kv_b_proj layer for prefill mode
|
||||
mock_kv_b_proj = MockKVBProj(
|
||||
num_heads=mla_dims["num_q_heads"],
|
||||
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
|
||||
v_head_dim=mla_dims["v_head_dim"],
|
||||
)
|
||||
|
||||
# Create impl
|
||||
impl = impl_class(
|
||||
num_heads=mla_dims["num_q_heads"],
|
||||
head_size=mla_dims["head_dim"],
|
||||
scale=scale,
|
||||
num_kv_heads=mla_dims["num_kv_heads"],
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype="auto",
|
||||
logits_soft_cap=None,
|
||||
attn_type="decoder",
|
||||
kv_sharing_target_layer_name=None,
|
||||
q_lora_rank=None,
|
||||
kv_lora_rank=mla_dims["kv_lora_rank"],
|
||||
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
|
||||
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
|
||||
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
|
||||
v_head_dim=mla_dims["v_head_dim"],
|
||||
kv_b_proj=mock_kv_b_proj,
|
||||
)
|
||||
|
||||
# Initialize DCP attributes
|
||||
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
|
||||
impl.dcp_world_size = 1
|
||||
impl.dcp_rank = 0
|
||||
|
||||
# Create KV cache spec for MockLayer
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
|
||||
num_kv_heads=1, # MLA uses 1 KV head
|
||||
head_size=576, # MLA head dim
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# Create mock layer
|
||||
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
|
||||
|
||||
# Create builder instance if needed
|
||||
builder_instance = None
|
||||
if backend_cfg["builder_class"]:
|
||||
builder_class = getattr(backend_module, backend_cfg["builder_class"])
|
||||
|
||||
# Populate static_forward_context so builder can find the layer
|
||||
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
|
||||
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
|
||||
|
||||
builder_instance = builder_class(
|
||||
kv_cache_spec=kv_cache_spec,
|
||||
layer_names=["placeholder"],
|
||||
vllm_config=vllm_config,
|
||||
device=device,
|
||||
)
|
||||
|
||||
return impl, layer, builder_instance
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Config Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _extract_mla_dims_from_config(config) -> dict | None:
|
||||
"""
|
||||
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
|
||||
|
||||
Args:
|
||||
config: BenchmarkConfig instance
|
||||
|
||||
Returns:
|
||||
Dict with MLA dimensions if all fields are provided, None otherwise
|
||||
"""
|
||||
# Check if all MLA-specific fields are provided
|
||||
if all(
|
||||
[
|
||||
config.kv_lora_rank is not None,
|
||||
config.qk_nope_head_dim is not None,
|
||||
config.qk_rope_head_dim is not None,
|
||||
config.v_head_dim is not None,
|
||||
]
|
||||
):
|
||||
return {
|
||||
"kv_lora_rank": config.kv_lora_rank,
|
||||
"qk_nope_head_dim": config.qk_nope_head_dim,
|
||||
"qk_rope_head_dim": config.qk_rope_head_dim,
|
||||
"v_head_dim": config.v_head_dim,
|
||||
"num_q_heads": config.num_q_heads,
|
||||
"num_kv_heads": config.num_kv_heads,
|
||||
"head_dim": config.head_dim,
|
||||
}
|
||||
# Fallback: if MLA fields not fully specified, try to construct from basic fields
|
||||
elif config.head_dim == 576:
|
||||
# This looks like a DeepSeek MLA config, use standard dimensions with custom
|
||||
# head count
|
||||
return {
|
||||
"kv_lora_rank": 512,
|
||||
"qk_nope_head_dim": 128,
|
||||
"qk_rope_head_dim": 64,
|
||||
"v_head_dim": 128,
|
||||
"num_q_heads": config.num_q_heads,
|
||||
"num_kv_heads": config.num_kv_heads,
|
||||
"head_dim": config.head_dim,
|
||||
}
|
||||
return None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Benchmark Execution
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
builder_instance,
|
||||
backend_cfg: dict,
|
||||
mla_dims: dict,
|
||||
device: torch.device,
|
||||
) -> BenchmarkResult:
|
||||
"""
|
||||
Run a single benchmark iteration.
|
||||
|
||||
Args:
|
||||
config: BenchmarkConfig instance
|
||||
impl: Backend implementation instance
|
||||
layer: MockLayer instance
|
||||
builder_instance: Metadata builder instance
|
||||
backend_cfg: Backend configuration dict
|
||||
mla_dims: MLA dimension configuration
|
||||
device: Target device
|
||||
|
||||
Returns:
|
||||
BenchmarkResult with timing statistics
|
||||
"""
|
||||
# Parse batch spec
|
||||
requests = parse_batch_spec(config.batch_spec)
|
||||
q_lens = [r.q_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
|
||||
# Determine block size
|
||||
block_size = backend_cfg["block_size"] or config.block_size
|
||||
|
||||
# Build metadata
|
||||
metadata, num_blocks = _build_attention_metadata(
|
||||
requests, block_size, device, builder_instance
|
||||
)
|
||||
|
||||
# Create KV cache
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# Create input tensors for both decode and prefill modes
|
||||
decode_inputs, prefill_inputs = _create_input_tensors(
|
||||
total_q,
|
||||
mla_dims,
|
||||
backend_cfg["query_format"],
|
||||
device,
|
||||
torch.bfloat16,
|
||||
)
|
||||
|
||||
# Determine which forward method to use based on metadata
|
||||
if metadata.decode is not None:
|
||||
forward_fn = lambda: impl._forward_decode(
|
||||
decode_inputs, kv_cache, metadata, layer
|
||||
)
|
||||
elif metadata.prefill is not None:
|
||||
forward_fn = lambda: impl._forward_prefill(
|
||||
prefill_inputs["q"],
|
||||
prefill_inputs["k_c_normed"],
|
||||
prefill_inputs["k_pe"],
|
||||
kv_cache,
|
||||
metadata,
|
||||
prefill_inputs["k_scale"],
|
||||
prefill_inputs["output"],
|
||||
)
|
||||
else:
|
||||
raise RuntimeError("Metadata has neither decode nor prefill metadata")
|
||||
|
||||
# Warmup
|
||||
for _ in range(config.warmup_iters):
|
||||
forward_fn()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start.record()
|
||||
for _ in range(config.num_layers):
|
||||
forward_fn()
|
||||
end.record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
elapsed_ms = start.elapsed_time(end)
|
||||
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
||||
|
||||
mean_time = float(np.mean(times))
|
||||
return BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=mean_time,
|
||||
std_time=float(np.std(times)),
|
||||
min_time=float(np.min(times)),
|
||||
max_time=float(np.max(times)),
|
||||
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
|
||||
)
|
||||
|
||||
|
||||
def _run_mla_benchmark_batched(
|
||||
backend: str,
|
||||
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Unified batched MLA benchmark runner for all backends.
|
||||
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
|
||||
|
||||
This function reuses backend initialization across multiple benchmarks
|
||||
to avoid setup/teardown overhead.
|
||||
|
||||
Args:
|
||||
backend: Backend name
|
||||
configs_with_params: List of (config, threshold, num_splits) tuples
|
||||
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
|
||||
- num_splits: num_kv_splits (CUTLASS only)
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
"""
|
||||
if not configs_with_params:
|
||||
return []
|
||||
|
||||
backend_cfg = _get_backend_config(backend)
|
||||
device = torch.device(configs_with_params[0][0].device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
# Determine block size
|
||||
config_block_size = configs_with_params[0][0].block_size
|
||||
block_size = backend_cfg["block_size"] or config_block_size
|
||||
|
||||
# Extract MLA dimensions from the first config
|
||||
first_config = configs_with_params[0][0]
|
||||
mla_dims = _extract_mla_dims_from_config(first_config)
|
||||
|
||||
# If config didn't provide MLA dims, fall back to default model
|
||||
if mla_dims is None:
|
||||
mla_dims = setup_mla_dims("deepseek-v3")
|
||||
|
||||
# Create and set vLLM config for MLA (reused across all benchmarks)
|
||||
vllm_config = create_minimal_vllm_config(
|
||||
model_name="deepseek-v3", # Used only for model path
|
||||
block_size=block_size,
|
||||
mla_dims=mla_dims, # Use custom dims from config or default
|
||||
)
|
||||
|
||||
results = []
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
# Create backend impl, layer, and builder (reused across benchmarks)
|
||||
impl, layer, builder_instance = _create_backend_impl(
|
||||
backend_cfg, mla_dims, vllm_config, device
|
||||
)
|
||||
|
||||
# Run each benchmark with the shared impl
|
||||
for config, threshold, num_splits in configs_with_params:
|
||||
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
|
||||
original_threshold = None
|
||||
if threshold is not None and builder_instance:
|
||||
original_threshold = builder_instance.reorder_batch_threshold
|
||||
builder_instance.reorder_batch_threshold = threshold
|
||||
|
||||
# Set num_splits for CUTLASS
|
||||
original_num_splits = None
|
||||
if num_splits is not None and hasattr(impl, "_num_kv_splits"):
|
||||
original_num_splits = impl._num_kv_splits
|
||||
impl._num_kv_splits = num_splits
|
||||
|
||||
try:
|
||||
result = _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
builder_instance,
|
||||
backend_cfg,
|
||||
mla_dims,
|
||||
device,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
finally:
|
||||
# Restore original threshold
|
||||
if original_threshold is not None:
|
||||
builder_instance.reorder_batch_threshold = original_threshold
|
||||
|
||||
# Restore original num_splits
|
||||
if original_num_splits is not None:
|
||||
impl._num_kv_splits = original_num_splits
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Public API
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def run_mla_benchmark(
|
||||
backend: str,
|
||||
config,
|
||||
reorder_batch_threshold: int | None = None,
|
||||
num_kv_splits: int | None = None,
|
||||
) -> BenchmarkResult | list[BenchmarkResult]:
|
||||
"""
|
||||
Unified MLA benchmark runner for all backends.
|
||||
|
||||
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
|
||||
|
||||
Always uses batched execution internally for optimal performance.
|
||||
|
||||
Args:
|
||||
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
|
||||
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
|
||||
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
|
||||
(single config mode only)
|
||||
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
|
||||
|
||||
Returns:
|
||||
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
|
||||
"""
|
||||
# Normalize to batched mode: (config, threshold, num_splits)
|
||||
if isinstance(config, list):
|
||||
# Already in batched format
|
||||
if len(config) > 0 and isinstance(config[0], tuple):
|
||||
# Format: [(cfg, param), ...] where param is threshold or num_splits
|
||||
if backend in ("flashattn_mla", "flashmla"):
|
||||
configs_with_params = [(cfg, param, None) for cfg, param in config]
|
||||
else: # cutlass_mla or flashinfer_mla
|
||||
configs_with_params = [(cfg, None, param) for cfg, param in config]
|
||||
else:
|
||||
# Format: [cfg, ...] - just configs
|
||||
configs_with_params = [(cfg, None, None) for cfg in config]
|
||||
return_single = False
|
||||
else:
|
||||
# Single config: convert to batched format
|
||||
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
|
||||
return_single = True
|
||||
|
||||
# Use unified batched execution
|
||||
results = _run_mla_benchmark_batched(backend, configs_with_params)
|
||||
|
||||
# Return single result or list based on input
|
||||
return results[0] if return_single else results
|
||||
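Note: a short usage sketch of the public entry point above (single-config mode; assumes a CUDA device and the chosen backend being installed):

# Illustrative only: run one CUTLASS MLA decode benchmark.
from common import BenchmarkConfig
from mla_runner import run_mla_benchmark

cfg = BenchmarkConfig(
    backend="cutlass_mla",
    batch_spec="32q1s2k",     # 32 decode requests, 2k KV cache each
    num_layers=10,
    head_dim=576,
    num_q_heads=128,
    num_kv_heads=1,
    block_size=128,
    device="cuda:0",
    repeats=5,
    warmup_iters=3,
)
result = run_mla_benchmark("cutlass_mla", cfg, num_kv_splits=8)
print(result.mean_time, result.throughput_tokens_per_sec)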
benchmarks/attention_benchmarks/runner.py (new file, 481 lines)
@@ -0,0 +1,481 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
|
||||
|
||||
This module provides helpers for running standard attention backends
|
||||
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
|
||||
"""
|
||||
|
||||
import types
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from batch_spec import parse_batch_spec, reorder_for_flashinfer
|
||||
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
|
||||
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
CompilationConfig,
|
||||
DeviceConfig,
|
||||
LoadConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
|
||||
# ============================================================================
|
||||
# Backend Configuration
|
||||
# ============================================================================
|
||||
|
||||
|
||||
_BACKEND_CONFIG = {
|
||||
"flash": {
|
||||
"module": "vllm.v1.attention.backends.flash_attn",
|
||||
"backend_class": "FlashAttentionBackend",
|
||||
"dtype": torch.float16,
|
||||
"cache_layout": "standard",
|
||||
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
|
||||
},
|
||||
"triton": {
|
||||
"module": "vllm.v1.attention.backends.triton_attn",
|
||||
"backend_class": "TritonAttentionBackend",
|
||||
"dtype": torch.float32,
|
||||
"cache_layout": "standard",
|
||||
},
|
||||
"flashinfer": {
|
||||
"module": "vllm.v1.attention.backends.flashinfer",
|
||||
"backend_class": "FlashInferBackend",
|
||||
"dtype": torch.float16,
|
||||
"cache_layout": "flashinfer",
|
||||
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _get_backend_config(backend: str) -> dict:
|
||||
if backend not in _BACKEND_CONFIG:
|
||||
raise ValueError(
|
||||
f"Unknown backend: {backend}. "
|
||||
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
|
||||
)
|
||||
return _BACKEND_CONFIG[backend]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Metadata Building Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _build_common_attn_metadata(
|
||||
q_lens: list[int],
|
||||
kv_lens: list[int],
|
||||
block_size: int,
|
||||
device: torch.device,
|
||||
) -> CommonAttentionMetadata:
|
||||
"""Build CommonAttentionMetadata from query/kv lengths."""
|
||||
batch_size = len(q_lens)
|
||||
total_tokens = sum(q_lens)
|
||||
|
||||
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
|
||||
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
|
||||
0
|
||||
)
|
||||
query_start_loc_cpu = query_start_loc.cpu()
|
||||
|
||||
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
|
||||
seq_lens_cpu = seq_lens.cpu()
|
||||
max_seq_len = int(seq_lens_cpu.max())
|
||||
|
||||
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
|
||||
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
|
||||
|
||||
max_blocks = (max(kv_lens) + block_size - 1) // block_size
|
||||
num_blocks = batch_size * max_blocks
|
||||
block_table_tensor = torch.arange(
|
||||
num_blocks, dtype=torch.int32, device=device
|
||||
).view(batch_size, max_blocks)
|
||||
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
|
||||
|
||||
max_query_len = max(q_lens)
|
||||
|
||||
return CommonAttentionMetadata(
|
||||
query_start_loc=query_start_loc,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
num_reqs=batch_size,
|
||||
num_actual_tokens=total_tokens,
|
||||
max_query_len=max_query_len,
|
||||
max_seq_len=max_seq_len,
|
||||
block_table_tensor=block_table_tensor,
|
||||
slot_mapping=slot_mapping,
|
||||
causal=True,
|
||||
)
|
||||
|
||||
|
||||
def _create_vllm_config(
|
||||
config: BenchmarkConfig,
|
||||
dtype: torch.dtype,
|
||||
max_num_blocks: int,
|
||||
) -> VllmConfig:
|
||||
"""Create a VllmConfig for benchmarking with mock model methods."""
|
||||
model_config = ModelConfig(
|
||||
model="meta-llama/Meta-Llama-3-8B",
|
||||
tokenizer="meta-llama/Meta-Llama-3-8B",
|
||||
trust_remote_code=False,
|
||||
dtype=dtype,
|
||||
seed=0,
|
||||
max_model_len=1024,
|
||||
)
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=config.block_size,
|
||||
cache_dtype="auto",
|
||||
swap_space=0,
|
||||
)
|
||||
cache_config.num_gpu_blocks = max_num_blocks
|
||||
cache_config.num_cpu_blocks = 0
|
||||
|
||||
parallel_config = ParallelConfig(tensor_parallel_size=1)
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=256,
|
||||
max_num_batched_tokens=8192,
|
||||
max_model_len=8192,
|
||||
is_encoder_decoder=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
device_config = DeviceConfig()
|
||||
load_config = LoadConfig()
|
||||
compilation_config = CompilationConfig()
|
||||
|
||||
# Add mock methods for benchmark config values
|
||||
model_config.get_num_layers = types.MethodType(
|
||||
lambda self: config.num_layers, model_config
|
||||
)
|
||||
model_config.get_sliding_window_for_layer = types.MethodType(
|
||||
lambda self, i: None, model_config
|
||||
)
|
||||
model_config.get_logits_soft_cap_for_layer = types.MethodType(
|
||||
lambda self, i: 0.0, model_config
|
||||
)
|
||||
model_config.get_sm_scale_for_layer = types.MethodType(
|
||||
lambda self, i: 1.0 / config.head_dim**0.5, model_config
|
||||
)
|
||||
model_config.get_num_attention_heads = types.MethodType(
|
||||
lambda self, parallel_config=None: config.num_q_heads, model_config
|
||||
)
|
||||
model_config.get_num_kv_heads = types.MethodType(
|
||||
lambda self, parallel_config=None: config.num_kv_heads, model_config
|
||||
)
|
||||
model_config.get_head_size = types.MethodType(
|
||||
lambda self: config.head_dim, model_config
|
||||
)
|
||||
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
|
||||
|
||||
return VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
scheduler_config=scheduler_config,
|
||||
device_config=device_config,
|
||||
load_config=load_config,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Backend Initialization
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _create_backend_impl(
|
||||
backend_cfg: dict,
|
||||
config: BenchmarkConfig,
|
||||
device: torch.device,
|
||||
):
|
||||
"""Create backend implementation instance."""
|
||||
import importlib
|
||||
|
||||
backend_module = importlib.import_module(backend_cfg["module"])
|
||||
backend_class = getattr(backend_module, backend_cfg["backend_class"])
|
||||
|
||||
scale = get_attention_scale(config.head_dim)
|
||||
dtype = backend_cfg["dtype"]
|
||||
|
||||
impl = backend_class.get_impl_cls()(
|
||||
num_heads=config.num_q_heads,
|
||||
head_size=config.head_dim,
|
||||
scale=scale,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype="auto",
|
||||
)
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=config.block_size,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
head_size=config.head_dim,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
|
||||
|
||||
return backend_class, impl, layer, dtype
|
||||
|
||||
|
||||
def _create_metadata_builder(
|
||||
backend_class,
|
||||
kv_cache_spec: FullAttentionSpec,
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
):
|
||||
"""Create metadata builder instance."""
|
||||
return backend_class.get_builder_cls()(
|
||||
kv_cache_spec=kv_cache_spec,
|
||||
layer_names=["layer_0"],
|
||||
vllm_config=vllm_config,
|
||||
device=device,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Tensor Creation Helpers
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _create_input_tensors(
|
||||
config: BenchmarkConfig,
|
||||
total_q: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
) -> tuple:
|
||||
"""Create Q, K, V input tensors for all layers."""
|
||||
q_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
k_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
v_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
return q_list, k_list, v_list
|
||||
|
||||
|
||||
def _create_kv_cache(
|
||||
config: BenchmarkConfig,
|
||||
max_num_blocks: int,
|
||||
cache_layout: str,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
) -> list:
|
||||
"""Create KV cache tensors for all layers."""
|
||||
if cache_layout == "flashinfer":
|
||||
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
|
||||
cache_list = [
|
||||
torch.zeros(
|
||||
max_num_blocks,
|
||||
2,
|
||||
config.block_size,
|
||||
config.num_kv_heads,
|
||||
config.head_dim,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
else:
|
||||
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
|
||||
cache_list = [
|
||||
torch.zeros(
|
||||
2,
|
||||
max_num_blocks,
|
||||
config.block_size,
|
||||
config.num_kv_heads,
|
||||
config.head_dim,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
return cache_list
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Benchmark Execution
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _run_single_benchmark(
|
||||
config: BenchmarkConfig,
|
||||
impl,
|
||||
layer,
|
||||
q_list: list,
|
||||
k_list: list,
|
||||
v_list: list,
|
||||
cache_list: list,
|
||||
attn_metadata,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
) -> tuple:
|
||||
"""Run single benchmark iteration with warmup and timing loop."""
|
||||
total_q = q_list[0].shape[0]
|
||||
out = torch.empty(
|
||||
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
|
||||
# Warmup
|
||||
for _ in range(config.warmup_iters):
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start.record()
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
end.record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
elapsed_ms = start.elapsed_time(end)
|
||||
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
||||
|
||||
mem_stats = {}
|
||||
if config.profile_memory:
|
||||
mem_stats = {
|
||||
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
|
||||
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
|
||||
}
|
||||
|
||||
return times, mem_stats
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Public API
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
"""
|
||||
Run standard attention benchmark with real kernels.
|
||||
|
||||
Supports: flash, triton, flashinfer
|
||||
|
||||
Args:
|
||||
config: Benchmark configuration
|
||||
|
||||
Returns:
|
||||
BenchmarkResult with timing and memory statistics
|
||||
"""
|
||||
device = torch.device(config.device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
backend_cfg = _get_backend_config(config.backend)
|
||||
|
||||
requests = parse_batch_spec(config.batch_spec)
|
||||
|
||||
if config.backend == "flashinfer":
|
||||
requests = reorder_for_flashinfer(requests)
|
||||
|
||||
q_lens = [r.q_len for r in requests]
|
||||
kv_lens = [r.kv_len for r in requests]
|
||||
total_q = sum(q_lens)
|
||||
max_kv = max(kv_lens)
|
||||
|
||||
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
|
||||
|
||||
backend_class, impl, layer, dtype = _create_backend_impl(
|
||||
backend_cfg, config, device
|
||||
)
|
||||
|
||||
common_metadata = _build_common_attn_metadata(
|
||||
q_lens, kv_lens, config.block_size, device
|
||||
)
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
block_size=config.block_size,
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
head_size=config.head_dim,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
|
||||
|
||||
builder = _create_metadata_builder(
|
||||
backend_class, kv_cache_spec, vllm_config, device
|
||||
)
|
||||
|
||||
attn_metadata = builder.build(
|
||||
common_prefix_len=0,
|
||||
common_attn_metadata=common_metadata,
|
||||
)
|
||||
|
||||
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
|
||||
|
||||
cache_list = _create_kv_cache(
|
||||
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
|
||||
)
|
||||
|
||||
times, mem_stats = _run_single_benchmark(
|
||||
config,
|
||||
impl,
|
||||
layer,
|
||||
q_list,
|
||||
k_list,
|
||||
v_list,
|
||||
cache_list,
|
||||
attn_metadata,
|
||||
device,
|
||||
dtype,
|
||||
)
|
||||
|
||||
mean_time = np.mean(times)
|
||||
throughput = total_q / mean_time if mean_time > 0 else 0
|
||||
|
||||
return BenchmarkResult(
|
||||
config=config,
|
||||
mean_time=mean_time,
|
||||
std_time=np.std(times),
|
||||
min_time=np.min(times),
|
||||
max_time=np.max(times),
|
||||
throughput_tokens_per_sec=throughput,
|
||||
memory_allocated_mb=mem_stats.get("allocated_mb"),
|
||||
memory_reserved_mb=mem_stats.get("reserved_mb"),
|
||||
)
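The field names below are inferred from the config.* attributes used in this file; whether BenchmarkConfig accepts them as keyword arguments is an assumption, so treat this as a sketch of a possible invocation rather than the module's documented usage:

# Hypothetical invocation; every field name is taken from the attribute
# accesses above (config.backend, config.batch_spec, config.num_layers, ...).
config = BenchmarkConfig(
    backend="flash",              # one of: flash, triton, flashinfer
    batch_spec="<batch spec>",    # parsed by parse_batch_spec into per-request q/kv lengths
    block_size=16,
    num_layers=32,
    num_q_heads=32,
    num_kv_heads=8,
    head_dim=128,
    device="cuda:0",
    warmup_iters=3,
    repeats=10,
    profile_memory=True,
)
result = run_attention_benchmark(config)
print(result.mean_time, result.throughput_tokens_per_sec)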
|
||||
@@ -20,8 +20,12 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
||||
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"vllm": dict(backend="vllm", enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", enabled=True),
|
||||
"vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
|
||||
"vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
|
||||
"flashinfer-swizzle": dict(
|
||||
backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
|
||||
),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
@@ -36,7 +40,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
@@ -63,19 +67,36 @@ def benchmark(batch_size, provider, N, K):
|
||||
|
||||
if cfg["backend"] == "vllm":
|
||||
# vLLM's FP4 quantization
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(a, a_global_scale),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
elif cfg["backend"] == "flashinfer":
|
||||
# FlashInfer's FP4 quantization
|
||||
# Use is_sf_swizzled_layout=True to match vLLM's output format
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
# Convert ms to us for better readability at small batch sizes
|
||||
to_us = lambda t_ms: t_ms * 1000
|
||||
@@ -92,7 +113,9 @@ def prepare_shapes(args):
|
||||
return out
|
||||
|
||||
|
||||
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
def _test_accuracy_once(
|
||||
M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
|
||||
):
|
||||
"""Test accuracy between vLLM and FlashInfer FP4 quantization."""
|
||||
# Create input tensor
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
@@ -101,11 +124,13 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
a_global_scale = compute_global_scale(a)
|
||||
|
||||
# vLLM quantization
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
|
||||
# FlashInfer quantization (with swizzled layout to match vLLM's output)
|
||||
flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
|
||||
|
||||
@@ -114,7 +139,14 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
vllm_fp4,
|
||||
flashinfer_fp4,
|
||||
)
|
||||
print(f"M={M}, K={K}, dtype={dtype}: PASSED")
|
||||
# Compare scales
|
||||
torch.testing.assert_close(
|
||||
vllm_scale,
|
||||
flashinfer_scale,
|
||||
)
|
||||
print(
|
||||
f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED" # noqa: E501
|
||||
)
|
||||
|
||||
|
||||
def test_accuracy():
|
||||
@@ -130,9 +162,10 @@ def test_accuracy():
|
||||
Ms = [1, 1024]
|
||||
Ks = [4096]
|
||||
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device)
|
||||
for is_sf_swizzled_layout in [True, False]:
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)
|
||||
|
||||
print("\nAll accuracy tests passed!")
|
||||
|
||||
@@ -145,7 +178,7 @@ if __name__ == "__main__":
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
|
||||
@@ -7,7 +7,7 @@ import itertools
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.activation # noqa F401
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.custom_op import op_registry
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
||||
@@ -33,14 +33,14 @@ def benchmark_activation(
|
||||
torch.set_default_device(device)
|
||||
|
||||
if func_name == "gelu_and_mul":
|
||||
layer = CustomOp.op_registry[func_name](approximate="none")
|
||||
layer = op_registry[func_name](approximate="none")
|
||||
elif func_name == "gelu_and_mul_tanh":
|
||||
layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
|
||||
layer = op_registry["gelu_and_mul"](approximate="tanh")
|
||||
elif func_name == "fatrelu_and_mul":
|
||||
threshold = 0.5
|
||||
layer = CustomOp.op_registry[func_name](threshold)
|
||||
layer = op_registry[func_name](threshold)
|
||||
else:
|
||||
layer = CustomOp.op_registry[func_name]()
|
||||
layer = op_registry[func_name]()
|
||||
|
||||
x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
|
||||
compiled_layer = torch.compile(layer.forward_native)
|
||||
|
||||
@@ -1,244 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from packaging import version
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
|
||||
MINIMUM_BITBLAS_VERSION,
|
||||
)
|
||||
|
||||
try:
|
||||
import bitblas
|
||||
|
||||
if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
|
||||
raise ImportError(
|
||||
"bitblas version is wrong. Please "
|
||||
f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
|
||||
)
|
||||
except ImportError as e:
|
||||
bitblas_import_exception = e
|
||||
raise ValueError(
|
||||
"Trying to use the bitblas backend, but could not import"
|
||||
f"with the following error: {bitblas_import_exception}. "
|
||||
"Please install bitblas through the following command: "
|
||||
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
|
||||
) from bitblas_import_exception
|
||||
|
||||
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
|
||||
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark BitBLAS int4 on a specific target."
|
||||
)
|
||||
|
||||
# Add arguments to the parser
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
type=str,
|
||||
default=auto_detect_nvidia_target(),
|
||||
help="Specify the target device for benchmarking.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--group_size", type=int, default=None, help="Group size for grouped quantization."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--A_dtype",
|
||||
type=str,
|
||||
default="float16",
|
||||
choices=["float16", "float32", "float64", "int32", "int8"],
|
||||
help="Data type of activation A.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--W_dtype",
|
||||
type=str,
|
||||
default="int4",
|
||||
choices=[
|
||||
"float16",
|
||||
"float32",
|
||||
"float64",
|
||||
"int32",
|
||||
"int8",
|
||||
"int4",
|
||||
"int2",
|
||||
"int1",
|
||||
"nf4",
|
||||
"fp4_e2m1",
|
||||
],
|
||||
help="Data type of weight W.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--accum_dtype",
|
||||
type=str,
|
||||
default="float16",
|
||||
choices=["float16", "int32"],
|
||||
help="Data type for accumulation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out_dtype",
|
||||
type=str,
|
||||
default="float16",
|
||||
choices=["float16", "float32", "int32", "int8"],
|
||||
help="Data type for output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--layout",
|
||||
type=str,
|
||||
default="nt",
|
||||
choices=["nt", "nn"],
|
||||
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_bias", action="store_true", help="Include bias in the benchmark."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_scaling",
|
||||
action="store_true",
|
||||
help="Include scaling factor in the quantization.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_zeros", action="store_true", help="Include zeros in the quantization."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--zeros_mode",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["original", "rescale", "quantized"],
|
||||
help="Specify the mode for calculating zeros.",
|
||||
)
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
||||
# Assign arguments to variables
|
||||
target = args.target
|
||||
A_dtype = args.A_dtype
|
||||
W_dtype = args.W_dtype
|
||||
accum_dtype = args.accum_dtype
|
||||
out_dtype = args.out_dtype
|
||||
layout = args.layout
|
||||
with_bias = args.with_bias
|
||||
group_size = args.group_size
|
||||
with_scaling = args.with_scaling
|
||||
with_zeros = args.with_zeros
|
||||
zeros_mode = args.zeros_mode
|
||||
|
||||
# Define a list of shared arguments that repeat in every config
|
||||
shared_args = [
|
||||
A_dtype,
|
||||
W_dtype,
|
||||
out_dtype,
|
||||
accum_dtype,
|
||||
layout,
|
||||
with_bias,
|
||||
group_size,
|
||||
with_scaling,
|
||||
with_zeros,
|
||||
zeros_mode,
|
||||
]
|
||||
|
||||
# Define just the (M, K, N) shapes in a more compact list
|
||||
shapes = [
|
||||
# square test
|
||||
(1, 16384, 16384),
|
||||
# BLOOM-176B
|
||||
(1, 43008, 14336),
|
||||
(1, 14336, 14336),
|
||||
(1, 57344, 14336),
|
||||
(1, 14336, 57344),
|
||||
# OPT-65B
|
||||
(1, 9216, 9216),
|
||||
(1, 36864, 9216),
|
||||
(1, 9216, 36864),
|
||||
(1, 22016, 8192),
|
||||
# LLAMA-70B/65B
|
||||
(1, 8192, 22016),
|
||||
(1, 8192, 8192),
|
||||
(1, 28672, 8192),
|
||||
(1, 8192, 28672),
|
||||
# square test
|
||||
(16384, 16384, 16384),
|
||||
# BLOOM-176B
|
||||
(8192, 43008, 14336),
|
||||
(8192, 14336, 14336),
|
||||
(8192, 57344, 14336),
|
||||
(8192, 14336, 57344),
|
||||
# OPT-65B
|
||||
(8192, 9216, 9216),
|
||||
(8192, 36864, 9216),
|
||||
(8192, 9216, 36864),
|
||||
(8192, 22016, 8192),
|
||||
# LLAMA-70B/65B
|
||||
(8192, 8192, 22016),
|
||||
(8192, 8192, 8192),
|
||||
(8192, 28672, 8192),
|
||||
(8192, 8192, 28672),
|
||||
]
|
||||
|
||||
# Build test shapes with all the shared arguments
|
||||
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]
|
||||
|
||||
benchmark_sets = []
|
||||
benchmark_sets.extend(test_shapes)
|
||||
|
||||
benchmark_results = {}
|
||||
for config_class, operator, input_args in benchmark_sets:
|
||||
config = config_class(*input_args)
|
||||
matmul = operator(config, target=target, enable_tuning=True)
|
||||
kernel_latency = matmul.profile_latency()
|
||||
|
||||
print("Time cost is: {:.3f} ms".format(kernel_latency))
|
||||
|
||||
profile_config = {
|
||||
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
|
||||
"BitBLAS_top20_latency": kernel_latency,
|
||||
}
|
||||
}
|
||||
|
||||
benchmark_results.update(profile_config)
|
||||
|
||||
# Define headers for the table
|
||||
headers = [
|
||||
"PrimFunc",
|
||||
"Input Arguments",
|
||||
"BitBLAS Top20 Latency",
|
||||
]
|
||||
|
||||
# Calculate column widths for pretty printing
|
||||
col_widths = [0, 0, 0]
|
||||
for config_key, values in benchmark_results.items():
|
||||
args_split = config_key.split("-")
|
||||
func_name = args_split[0]
|
||||
input_args_str = "-".join(args_split[1:])
|
||||
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
|
||||
col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
|
||||
col_widths[2] = max(
|
||||
col_widths[2],
|
||||
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
|
||||
len(headers[2]) + 2,
|
||||
)
|
||||
# break only if you want to measure widths from a single example;
|
||||
# otherwise, let it loop over all items.
|
||||
|
||||
# Print header
|
||||
for i, header in enumerate(headers):
|
||||
headers[i] = header.ljust(col_widths[i])
|
||||
print("".join(headers))
|
||||
print("-" * sum(col_widths))
|
||||
|
||||
# Print rows
|
||||
for config_key, values in benchmark_results.items():
|
||||
args_split = config_key.split("-")
|
||||
func_name = args_split[0]
|
||||
input_args_str = "-".join(args_split[1:])
|
||||
row = [
|
||||
func_name,
|
||||
input_args_str,
|
||||
f"{values['BitBLAS_top20_latency']:.3f} ms",
|
||||
]
|
||||
row_str = "".join(
|
||||
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
|
||||
)
|
||||
print(row_str)
|
||||
@@ -9,6 +9,7 @@ but use different quantization strategies and backends.
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
||||
@@ -138,12 +139,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
e=num_experts,
|
||||
n=n,
|
||||
k=k,
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=num_experts,
|
||||
hidden_dim=k,
|
||||
intermediate_size_per_partition=n,
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
@@ -196,10 +197,9 @@ def bench_run(
|
||||
)
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -242,10 +242,9 @@ def bench_run(
|
||||
)
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
benchmarks/kernels/benchmark_fused_topk.py
@@ -0,0 +1,99 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.router.fused_topk_router import fused_topk
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
num_tokens_range = [2**i for i in range(0, 8, 2)]
|
||||
num_experts_range = [16, 32, 64, 128, 256, 512]
|
||||
topk_range = [3, 4]
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
|
||||
|
||||
def torch_topk(
|
||||
gating_output: torch.Tensor,
|
||||
topk: int,
|
||||
renormalize: bool,
|
||||
scoring_func: str = "softmax",
|
||||
):
|
||||
if scoring_func == "softmax":
|
||||
scores = torch.softmax(gating_output.float(), dim=-1)
|
||||
else:
|
||||
scores = torch.sigmoid(gating_output.float())
|
||||
topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
|
||||
|
||||
if renormalize:
|
||||
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
|
||||
|
||||
return topk_weights, topk_ids
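As a quick sanity check of the eager baseline above: with renormalize=True the selected expert weights sum to 1 per token. Small CPU tensors are enough; the sizes are arbitrary:

import torch

gating = torch.randn(2, 8)                      # 2 tokens, 8 experts
weights, ids = torch_topk(gating, topk=3, renormalize=True)
print(ids.shape)                                # torch.Size([2, 3])
print(weights.sum(dim=-1))                      # ~tensor([1., 1.])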
|
||||
|
||||
|
||||
def get_benchmark(scoring_func):
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_vals=[list(_) for _ in configs],
|
||||
line_arg="provider",
|
||||
line_vals=["torch", "vllm"],
|
||||
line_names=["Torch", "vLLM"],
|
||||
styles=[("blue", "-"), ("red", "-")],
|
||||
ylabel="us",
|
||||
plot_name=f"fused-topk-perf-{scoring_func}",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
dtype = torch.bfloat16
|
||||
hidden_size = 1024
|
||||
renormalize = True
|
||||
hidden_states = torch.randn(
|
||||
(num_tokens, hidden_size), dtype=dtype, device="cuda"
|
||||
)
|
||||
gating_output = torch.randn(
|
||||
(num_tokens, num_experts), dtype=dtype, device="cuda"
|
||||
)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: torch_topk(
|
||||
gating_output=gating_output,
|
||||
topk=topk,
|
||||
renormalize=renormalize,
|
||||
scoring_func=scoring_func,
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: fused_topk(
|
||||
hidden_states=hidden_states,
|
||||
gating_output=gating_output,
|
||||
topk=topk,
|
||||
renormalize=renormalize,
|
||||
scoring_func=scoring_func,
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
return benchmark
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the MoE topk kernel.")
|
||||
parser.add_argument("--scoring-func", type=str, default="softmax")
|
||||
parser.add_argument("--save-path", type=str, default="./configs/fused_topk/")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get the benchmark function
|
||||
benchmark = get_benchmark(args.scoring_func)
|
||||
# Run performance benchmark
|
||||
benchmark.run(print_data=True, save_path=args.save_path)
|
||||
@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
@@ -134,13 +135,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -166,13 +167,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -842,6 +842,7 @@ class BenchmarkTensors:
|
||||
"sorted_token_ids": sorted_token_ids,
|
||||
"expert_ids": expert_ids,
|
||||
"num_tokens_post_padded": num_tokens_post_padded,
|
||||
"token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
|
||||
"top_k_num": ctx.top_k_num,
|
||||
"device": self.input.device,
|
||||
"N": lora_rank,
|
||||
@@ -915,6 +916,7 @@ class BenchmarkTensors:
|
||||
"sorted_token_ids": sorted_token_ids,
|
||||
"expert_ids": expert_ids,
|
||||
"num_tokens_post_padded": num_tokens_post_padded,
|
||||
"token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
|
||||
"top_k_num": ctx.top_k_num,
|
||||
"device": self.input.device,
|
||||
"N": lora_rank,
|
||||
|
||||
@@ -231,7 +231,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
|
||||
assert bt.w_tok_s is None
|
||||
assert bt.group_size is not None
|
||||
|
||||
fn = lambda: ops.gptq_marlin_gemm(
|
||||
fn = lambda: ops.marlin_gemm(
|
||||
a=bt.a,
|
||||
c=None,
|
||||
b_q_weight=w_q,
|
||||
|
||||
@@ -6,12 +6,6 @@ import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL,
|
||||
GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
|
||||
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.allspark_utils import (
|
||||
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
|
||||
ALLSPARK_SUPPORTED_QUANT_TYPES,
|
||||
@@ -34,9 +28,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
awq_marlin_quantize,
|
||||
marlin_quantize,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
|
||||
marlin_24_quantize,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
gptq_pack,
|
||||
gptq_quantize_weights,
|
||||
@@ -78,14 +69,7 @@ def bench_run(
|
||||
if size_k % group_size != 0:
|
||||
return
|
||||
|
||||
marlin_24_supported = (
|
||||
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
|
||||
)
|
||||
repack_supported = (
|
||||
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
|
||||
)
|
||||
repack_supported = group_size in MARLIN_SUPPORTED_GROUP_SIZES
|
||||
allspark_supported = (
|
||||
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
|
||||
and group_size == -1
|
||||
@@ -126,14 +110,6 @@ def bench_run(
|
||||
marlin_sort_indices,
|
||||
)
|
||||
|
||||
def gen_marlin_24_params():
|
||||
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
|
||||
if marlin_24_supported:
|
||||
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
|
||||
marlin_24_quantize(b, quant_type, group_size)
|
||||
)
|
||||
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
|
||||
|
||||
def gen_repack_params():
|
||||
q_w_gptq = None
|
||||
repack_sort_indices = None
|
||||
@@ -188,9 +164,6 @@ def bench_run(
|
||||
marlin_g_idx,
|
||||
marlin_sort_indices,
|
||||
) = gen_marlin_params()
|
||||
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
|
||||
gen_marlin_24_params()
|
||||
)
|
||||
q_w_gptq, repack_sort_indices = gen_repack_params()
|
||||
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
|
||||
gen_allspark_params()
|
||||
@@ -200,9 +173,6 @@ def bench_run(
|
||||
marlin_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
|
||||
)
|
||||
marlin_24_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
|
||||
)
|
||||
|
||||
globals = {
|
||||
# Gen params
|
||||
@@ -222,12 +192,6 @@ def bench_run(
|
||||
"marlin_sort_indices": marlin_sort_indices,
|
||||
"marlin_workspace": marlin_workspace,
|
||||
"is_k_full": is_k_full,
|
||||
# Marlin_24 params
|
||||
"marlin_24_w_ref": marlin_24_w_ref,
|
||||
"marlin_24_q_w_comp": marlin_24_q_w_comp,
|
||||
"marlin_24_meta": marlin_24_meta,
|
||||
"marlin_24_s": marlin_24_s,
|
||||
"marlin_24_workspace": marlin_24_workspace,
|
||||
# GPTQ params
|
||||
"q_w_gptq": q_w_gptq,
|
||||
"repack_sort_indices": repack_sort_indices,
|
||||
@@ -239,8 +203,7 @@ def bench_run(
|
||||
"sm_version": sm_version,
|
||||
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
|
||||
# Kernels
|
||||
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||
"marlin_gemm": ops.marlin_gemm,
|
||||
"gptq_marlin_repack": ops.gptq_marlin_repack,
|
||||
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
|
||||
}
|
||||
@@ -263,35 +226,24 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm",
|
||||
description="marlin_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm_fp32",
|
||||
description="marlin_gemm_fp32",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
if marlin_24_supported:
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_24_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
if repack_supported:
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
|
||||
@@ -15,12 +15,18 @@ import ray
|
||||
import torch
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
FusedMoEQuantConfig,
|
||||
RoutingMethodType,
|
||||
_get_config_dtype_str,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
||||
TritonOrDeepGemmExperts,
|
||||
)
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
@@ -194,10 +200,36 @@ def benchmark_config(
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
|
||||
deep_gemm_experts = None
|
||||
if use_deep_gemm:
|
||||
deep_gemm_experts = mk.FusedMoEModularKernel(
|
||||
prepare_finalize=MoEPrepareAndFinalizeNoEP(),
|
||||
fused_experts=TritonOrDeepGemmExperts(
|
||||
moe_config=FusedMoEConfig(
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
hidden_dim=hidden_size,
|
||||
intermediate_size_per_partition=shard_intermediate_size,
|
||||
num_local_experts=num_experts,
|
||||
activation="silu",
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=init_dtype,
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
device="cuda",
|
||||
),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
with override_config(config):
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
x, input_gating, topk, renormalize=not use_deep_gemm
|
||||
)
|
||||
|
||||
if use_deep_gemm:
|
||||
return deep_gemm_experts(
|
||||
x, w1, w2, topk_weights, topk_ids, inplace=True
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
w1,
|
||||
@@ -206,7 +238,6 @@ def benchmark_config(
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
quant_config=quant_config,
|
||||
allow_deep_gemm=use_deep_gemm,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
@@ -450,6 +481,8 @@ class BenchmarkWorker:
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
# local import to allow serialization by ray
|
||||
|
||||
set_random_seed(self.seed)
|
||||
dtype_str = _get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
@@ -503,6 +536,9 @@ class BenchmarkWorker:
|
||||
block_quant_shape: list[int],
|
||||
use_deep_gemm: bool,
|
||||
) -> dict[str, int]:
|
||||
# local import to allow serialization by ray
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
best_config = None
|
||||
best_time = float("inf")
|
||||
if current_platform.is_rocm():
|
||||
@@ -614,20 +650,28 @@ def save_configs(
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def get_compressed_tensors_block_structure(config, default_value=None):
|
||||
config_groups = config.get("config_groups", {})
|
||||
if len(config_groups) != 1:
|
||||
return default_value
|
||||
group = next(iter(config_groups.values()))
|
||||
weights = group.get("weights", {})
|
||||
block_structure = weights.get("block_structure", default_value)
|
||||
return block_structure
|
||||
|
||||
|
||||
def get_weight_block_size_safety(config, default_value=None):
|
||||
quantization_config = getattr(config, "quantization_config", {})
|
||||
if isinstance(quantization_config, dict):
|
||||
return quantization_config.get("weight_block_size", default_value)
|
||||
if "weight_block_size" in quantization_config:
|
||||
return quantization_config["weight_block_size"]
|
||||
return get_compressed_tensors_block_structure(
|
||||
quantization_config, default_value
|
||||
)
|
||||
return default_value
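To make the new compressed-tensors fallback concrete, here is the dict shape it walks; the keys follow the compressed-tensors checkpoint format and the sample values are made up:

# Illustrative only: a pared-down compressed-tensors quantization_config dict.
sample_quant_config = {
    "config_groups": {
        "group_0": {"weights": {"block_structure": [128, 128]}},
    },
}
print(get_compressed_tensors_block_structure(sample_quant_config))              # [128, 128]
print(get_compressed_tensors_block_structure({"config_groups": {}}, None))      # None (needs exactly one group)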
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
||||
if args.model_prefix:
|
||||
config = getattr(config, args.model_prefix)
|
||||
|
||||
def get_model_params(config):
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
@@ -642,8 +686,11 @@ def main(args: argparse.Namespace):
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"GlmMoeDsaForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
"Glm4MoeLiteForCausalLM",
|
||||
"NemotronHForCausalLM",
|
||||
"MistralLarge3ForCausalLM",
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
@@ -664,16 +711,20 @@ def main(args: argparse.Namespace):
|
||||
topk = text_config.num_experts_per_tok
|
||||
intermediate_size = text_config.moe_intermediate_size
|
||||
hidden_size = text_config.hidden_size
|
||||
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
|
||||
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.moe_topk[0]
|
||||
intermediate_size = config.moe_intermediate_size[0]
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
|
||||
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
|
||||
E = config.thinker_config.text_config.num_experts
|
||||
topk = config.thinker_config.text_config.num_experts_per_tok
|
||||
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
|
||||
hidden_size = config.thinker_config.text_config.hidden_size
|
||||
elif config.architectures[0] == "PixtralForConditionalGeneration":
|
||||
# Pixtral can contain different LLM architectures,
|
||||
# recurse to get their parameters
|
||||
return get_model_params(config.get_text_config())
|
||||
else:
|
||||
# Support for llama4
|
||||
config = config.get_text_config()
|
||||
@@ -682,6 +733,16 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
return E, topk, intermediate_size, hidden_size
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
||||
if args.model_prefix:
|
||||
config = getattr(config, args.model_prefix)
|
||||
E, topk, intermediate_size, hidden_size = get_model_params(config)
|
||||
enable_ep = bool(args.enable_expert_parallel)
|
||||
if enable_ep:
|
||||
ensure_divisibility(E, args.tp_size, "Number of experts")
|
||||
|
||||
@@ -8,10 +8,8 @@ import ray
|
||||
import torch
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||
_moe_permute,
|
||||
_moe_unpermute_and_reduce,
|
||||
moe_permute,
|
||||
moe_unpermute,
|
||||
)
|
||||
@@ -41,16 +39,13 @@ def benchmark_permute(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
use_customized_permute: bool = False,
|
||||
) -> float:
|
||||
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
# output_hidden_states = torch.empty_like(hidden_states)
|
||||
if use_fp8_w8a8:
|
||||
align_block_size = 128 # deepgemm needs 128 m aligned block
|
||||
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
|
||||
else:
|
||||
align_block_size = None
|
||||
qhidden_states = hidden_states
|
||||
|
||||
gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
|
||||
@@ -64,31 +59,13 @@ def benchmark_permute(
|
||||
input_gating.copy_(gating_output[i])
|
||||
|
||||
def run():
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
)
|
||||
moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
@@ -133,16 +110,12 @@ def benchmark_unpermute(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
use_customized_permute: bool = False,
|
||||
) -> float:
|
||||
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
output_hidden_states = torch.empty_like(hidden_states)
|
||||
if use_fp8_w8a8:
|
||||
align_block_size = 128 # deepgemm needs 128 m aligned block
|
||||
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
|
||||
else:
|
||||
align_block_size = None
|
||||
qhidden_states = hidden_states
|
||||
|
||||
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
|
||||
@@ -152,78 +125,36 @@ def benchmark_unpermute(
|
||||
)
|
||||
|
||||
def prepare():
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
align_block_size=align_block_size,
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_hidden_states.to(dtype),
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_qhidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_qhidden_states.to(dtype),
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
)
|
||||
(
|
||||
permuted_hidden_states,
|
||||
_,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
_,
|
||||
) = moe_permute(
|
||||
qhidden_states,
|
||||
a1q_scale=None,
|
||||
topk_ids=topk_ids,
|
||||
n_expert=num_experts,
|
||||
expert_map=None,
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
permuted_hidden_states.to(dtype),
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
)
|
||||
|
||||
def run(input: tuple):
|
||||
if use_customized_permute:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
first_token_off,
|
||||
inv_perm_idx,
|
||||
m_indices,
|
||||
) = input
|
||||
output = torch.empty_like(hidden_states)
|
||||
moe_unpermute(
|
||||
output,
|
||||
permuted_hidden_states,
|
||||
topk_weights,
|
||||
inv_perm_idx,
|
||||
first_token_off,
|
||||
)
|
||||
else:
|
||||
(
|
||||
permuted_hidden_states,
|
||||
a1q_scale,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = input
|
||||
_moe_unpermute_and_reduce(
|
||||
output_hidden_states,
|
||||
permuted_hidden_states,
|
||||
inv_perm,
|
||||
topk_weights,
|
||||
True,
|
||||
)
|
||||
(permuted_hidden_states, first_token_off, inv_perm_idx) = input
|
||||
output = torch.empty_like(hidden_states)
|
||||
moe_unpermute(
|
||||
output,
|
||||
permuted_hidden_states,
|
||||
topk_weights,
|
||||
inv_perm_idx,
|
||||
first_token_off,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
input = prepare()
|
||||
@@ -278,8 +209,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
use_customized_permute: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
) -> tuple[float, float]:
|
||||
set_random_seed(self.seed)
|
||||
|
||||
permute_time = benchmark_permute(
|
||||
@@ -291,7 +221,6 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
use_customized_permute=use_customized_permute,
|
||||
)
|
||||
unpermute_time = benchmark_unpermute(
|
||||
num_tokens,
|
||||
@@ -302,7 +231,6 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
use_customized_permute=use_customized_permute,
|
||||
)
|
||||
return permute_time, unpermute_time
|
||||
|
||||
@@ -330,6 +258,7 @@ def main(args: argparse.Namespace):
|
||||
config.architectures[0] == "DeepseekV3ForCausalLM"
|
||||
or config.architectures[0] == "DeepseekV2ForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeLiteForCausalLM"
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
@@ -348,7 +277,6 @@ def main(args: argparse.Namespace):
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_customized_permute = args.use_customized_permute
|
||||
|
||||
if args.batch_size is None:
|
||||
batch_sizes = [
|
||||
@@ -400,7 +328,6 @@ def main(args: argparse.Namespace):
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
use_customized_permute,
|
||||
)
|
||||
for batch_size in batch_sizes
|
||||
],
|
||||
@@ -420,7 +347,6 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
|
||||
)
|
||||
parser.add_argument("--use-customized-permute", action="store_true")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--trust-remote-code", action="store_true")
|
||||
|
||||
@@ -22,8 +22,8 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
assert current_platform.is_cuda(), (
|
||||
"Only support tune w8a8 block fp8 kernel on CUDA device."
|
||||
assert current_platform.is_cuda() or current_platform.is_rocm(), (
|
||||
"Only support tune w8a8 block fp8 kernel on CUDA/ROCm device."
|
||||
)
|
||||
|
||||
DTYPE_MAP = {
|
||||
|
||||
@@ -14,7 +14,7 @@ from vllm._custom_ops import (
|
||||
)
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
||||
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ def main(
|
||||
seed: int = 0,
|
||||
iters: int = 20,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
|
||||
@@ -7,8 +7,8 @@ import time
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
# Check if CPU MoE operations are available
|
||||
try:
|
||||
@@ -41,7 +41,7 @@ def main(
|
||||
seed: int = 0,
|
||||
iters: int = 20,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
# up_dim = 2 * intermediate_size for gate + up projection
|
||||
up_dim = 2 * intermediate_size
|
||||
|
||||
|
||||
@@ -14,7 +14,6 @@ from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import (
|
||||
calc_diff,
|
||||
fp8_gemm_nt,
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_block_cast_to_fp8,
|
||||
)
|
||||
|
||||
@@ -48,8 +47,9 @@ def benchmark_shape(
|
||||
block_size = [128, 128]
|
||||
|
||||
# Pre-quantize A for all implementations
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
|
||||
A, block_size[1], column_major_scales=True, tma_aligned_scales=True
|
||||
)
|
||||
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
|
||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||
|
||||
@@ -13,6 +13,8 @@ endif()
|
||||
#
|
||||
# Define environment variables for special configurations
|
||||
#
|
||||
set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
|
||||
set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
|
||||
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
|
||||
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
|
||||
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
|
||||
@@ -103,6 +105,16 @@ else()
|
||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
||||
|
||||
# Support cross-compilation by allowing override via environment variables
|
||||
if (ENABLE_AVX2)
|
||||
set(AVX2_FOUND ON)
|
||||
message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
|
||||
endif()
|
||||
if (ENABLE_AVX512)
|
||||
set(AVX512_FOUND ON)
|
||||
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
@@ -347,6 +359,19 @@ else()
|
||||
add_compile_definitions(-DVLLM_NUMA_DISABLED)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Generate CPU attention dispatch header
|
||||
#
|
||||
message(STATUS "Generating CPU attention dispatch header")
|
||||
execute_process(
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
|
||||
RESULT_VARIABLE GEN_RESULT
|
||||
)
|
||||
if(NOT GEN_RESULT EQUAL 0)
|
||||
message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
|
||||
endif()
|
||||
|
||||
#
|
||||
# _C extension
|
||||
#
|
||||
@@ -379,6 +404,12 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
endif()
|
||||
|
||||
if(USE_ONEDNN)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/dnnl_kernels.cpp"
|
||||
|
||||
@@ -19,7 +19,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
flashmla
|
||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
|
||||
GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
|
||||
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
|
||||
GIT_PROGRESS TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
@@ -30,6 +30,24 @@ endif()
|
||||
FetchContent_MakeAvailable(flashmla)
|
||||
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
|
||||
# Vendor FlashMLA interface into vLLM with torch-ops shim.
|
||||
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
|
||||
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
|
||||
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
|
||||
FLASHMLA_INTERFACE_CONTENT)
|
||||
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
|
||||
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
|
||||
FLASHMLA_INTERFACE_CONTENT
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
|
||||
# Install the generated flash_mla_interface.py to the wheel
|
||||
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
|
||||
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
DESTINATION vllm/third_party/flashmla/
|
||||
COMPONENT _flashmla_C)
|
||||
|
||||
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||
# Only build FlashMLA kernels if we are building for something compatible with
|
||||
# sm90a
|
||||
@@ -55,16 +73,42 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_SOURCES
|
||||
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
|
||||
|
||||
# Misc kernels for decoding
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu
|
||||
|
||||
# sm90 dense decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
|
||||
|
||||
# sm90 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
|
||||
|
||||
# sm90 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
|
||||
|
||||
# sm100 dense prefill & backward
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
|
||||
|
||||
# sm100 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
|
||||
|
||||
# sm100 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
|
||||
)
|
||||
|
||||
set(FlashMLA_Extension_SOURCES
|
||||
@@ -76,6 +120,7 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/kerutils/include
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -83,7 +128,6 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_Extension_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -110,9 +154,12 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
||||
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
|
||||
target_compile_options(_flashmla_C PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
|
||||
|
||||
define_extension_target(
|
||||
_flashmla_extension_C
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <vector>
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
|
||||
@@ -24,7 +24,14 @@
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
#endif
|
||||
|
||||
#if defined(__gfx942__)
|
||||
constexpr float kFp8ScaleDivisor = 224.f;
|
||||
#else
|
||||
constexpr float kFp8ScaleDivisor = 448.f;
|
||||
#endif
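The new constant folds the platform-dependent FP8 range into one place: gfx942 uses the smaller 224 divisor, everything else keeps 448. A host-side sketch in Python of the per-tile scale computation that concat_and_cache_ds_mla_kernel performs further down (the clamp mirrors the kernel's fmaxf(..., FLT_MIN); the tile contents are made up):

import torch

FP8_SCALE_DIVISOR = 448.0  # 224.0 on gfx942, mirroring the #if block above

def tile_scale_sketch(tile: torch.Tensor) -> float:
    # One scale per tile: max |value| over the tile divided by the FP8 divisor,
    # clamped away from zero the same way the kernel clamps with FLT_MIN.
    flt_min = torch.finfo(torch.float32).tiny
    return max(tile.abs().max().item() / FP8_SCALE_DIVISOR, flt_min)

tile = torch.randn(128)
scale = tile_scale_sketch(tile)
print(scale, (tile / scale).abs().max().item())  # scaled magnitudes stay within the FP8 range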
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping) {
|
||||
torch::Device src_device = src.device();
|
||||
torch::Device dst_device = dst.device();
|
||||
@@ -49,10 +56,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
char* src_ptr = static_cast<char*>(src.data_ptr());
|
||||
char* dst_ptr = static_cast<char*>(dst.data_ptr());
|
||||
|
||||
// We use the stride instead of numel in case the cache is padded for memory
|
||||
// alignment reasons, we assume the blocks data (inclusive of any padding)
|
||||
// is contiguous in memory
|
||||
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(
|
||||
src_device.is_cuda() ? src_device : dst_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
@@ -205,7 +208,8 @@ __global__ void reshape_and_cache_flash_kernel(
const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride,
const int64_t value_stride, const int num_heads, const int head_size,
const int block_size, const float* k_scale, const float* v_scale) {
const int block_size, const float* k_scale, const float* v_scale,
const int kv_scale_stride) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded

@@ -229,21 +233,23 @@ __global__ void reshape_and_cache_flash_kernel(
// this is true for the NHD layout where `head_stride == head_size`
const bool is_contiguous_heads = (head_stride == head_size);

float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
if (is_contiguous_heads) {
// NHD layout

if (is_contiguous_heads && kv_scale_stride == 0) {
// NHD layout and k/v_scales are [1] (i.e. single scale for all heads)
// kv cache: [num_blocks, block_size, num_heads, head_size]
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;

CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};

vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
blockDim.x, k_op);

vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
threadIdx.x, blockDim.x, v_op);

} else {
// HND layout OR k/v_scales are [num_heads] (i.e. per-attn-head)
// HND layout: heads are strided, but each head_size segment is contiguous
// kv cache: [num_blocks, num_heads, block_size, head_size]
const int lane = threadIdx.x & 31;  // 0..31 within warp
@@ -259,6 +265,16 @@ __global__ void reshape_and_cache_flash_kernel(
cache_t* __restrict__ v_dst_h =
value_dst + static_cast<int64_t>(head) * head_stride;

float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
? 0.f
: k_scale[head * kv_scale_stride];
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
? 0.f
: v_scale[head * kv_scale_stride];

CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};

// within each head, let the 32 threads of the warp perform the vector
// copy
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
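Editor's note on the hunk above: kv_scale_stride lets one code path serve both a single broadcast scale and per-head scales, because with stride 0 every head indexes element 0 and with stride 1 head h indexes element h. A minimal host-side sketch of the same indexing (illustrative only, not part of this commit; scale_for_head is a made-up helper):

#include <cassert>
#include <vector>

// Illustrative only: mirrors the k_scale[head * kv_scale_stride] lookup above.
inline float scale_for_head(const std::vector<float>& scales, int head) {
  const int stride = scales.size() > 1 ? 1 : 0;  // 0 -> broadcast, 1 -> per-head
  return scales[head * stride];
}

int main() {
  std::vector<float> broadcast{0.5f};
  std::vector<float> per_head{0.5f, 0.25f, 0.125f, 1.0f};
  assert(scale_for_head(broadcast, 3) == 0.5f);   // every head sees the one scale
  assert(scale_for_head(per_head, 2) == 0.125f);  // head 2 uses its own scale
  return 0;
}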
@@ -391,8 +407,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
}

// Compute the scale for the tile
float tile_scale = max_abs / 448.f;
tile_scale = fmaxf(tile_scale, FLT_MIN);
float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);

// The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) {

@@ -461,11 +476,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
#endif
}

#if defined(__gfx942__)
float scale = fmaxf(amax, 1e-4) / 224.0f;
#else
float scale = fmaxf(amax, 1e-4) / 448.0f;
#endif
float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;

if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale)));
}
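Editor's note: the two hunks above fold the per-architecture FP8 maximum into kFp8ScaleDivisor (224 on gfx942, 448 elsewhere) and, when use_ue8m0 is set, round the scale up to the next power of two so it can be stored as an exponent-only value. A standalone host-side sketch of that scale math (illustrative only, not part of this commit):

#include <algorithm>
#include <cmath>

// Illustrative only: same arithmetic as the kernel, computed on the host.
inline float fp8_quant_scale(float amax, float divisor, bool use_ue8m0) {
  float scale = std::max(amax, 1e-4f) / divisor;
  if (use_ue8m0) {
    // Round up to a power of two so the scale fits an exponent-only (UE8M0) encoding.
    scale = std::exp2(std::ceil(std::log2(scale)));
  }
  return scale;
}
// Example: amax = 100.f, divisor = 448.f -> raw scale ~= 0.2232, UE8M0 scale = 0.25.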
@@ -608,7 +620,8 @@ void reshape_and_cache(
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
head_stride, key_stride, value_stride, num_heads, head_size, \
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()));
reinterpret_cast<const float*>(v_scale.data_ptr()), \
kv_scale_stride);

void reshape_and_cache_flash(
torch::Tensor& key,  // [num_tokens, num_heads, head_size]

@@ -617,8 +630,9 @@ void reshape_and_cache_flash(
torch::Tensor&
value_cache,  // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale) {
const std::string& kv_cache_dtype,
torch::Tensor& k_scale,  // [1] or [num_heads]
torch::Tensor& v_scale) {  // [1] or [num_heads]
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
// slot_mapping.size(0) because of padding for CUDA graphs.
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because

@@ -641,6 +655,12 @@ void reshape_and_cache_flash(
int64_t head_stride = key_cache.stride(2);
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));

TORCH_CHECK(k_scale.sizes() == v_scale.sizes(),
"k_scale and v_scale must have the same shape");
TORCH_CHECK(k_scale.numel() == 1 || k_scale.numel() == num_heads,
"k_scale and v_scale must be of shape [1] or [num_heads]");
int kv_scale_stride = (k_scale.numel() > 1) ? 1 : 0;

dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
@@ -1,79 +1,4 @@
#include "cpu_attn_vec.hpp"
#include "cpu_attn_vec16.hpp"

#ifdef CPU_CAPABILITY_AMXBF16
#include "cpu_attn_amx.hpp"
#define AMX_DISPATCH(...) \
case cpu_attention::ISA::AMX: { \
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
scalar_t, head_dim>; \
return __VA_ARGS__(); \
}
#else
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
#endif

#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
// NEON requires head_dim to be a multiple of 32
#define NEON_DISPATCH(...) \
case cpu_attention::ISA::NEON: { \
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
scalar_t, head_dim>; \
return __VA_ARGS__(); \
}
#else
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
#endif  // #ifdef __aarch64__

#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
case HEAD_DIM: { \
constexpr size_t head_dim = HEAD_DIM; \
return __VA_ARGS__(); \
}

#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...) \
[&] { \
switch (HEAD_DIM) { \
CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__) \
CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__) \
default: { \
TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
std::to_string(HEAD_DIM)); \
} \
} \
}()

#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...) \
[&] { \
switch (ISA_TYPE) { \
AMX_DISPATCH(__VA_ARGS__) \
NEON_DISPATCH(__VA_ARGS__) \
case cpu_attention::ISA::VEC: { \
using attn_impl = \
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t, \
head_dim>; \
return __VA_ARGS__(); \
} \
case cpu_attention::ISA::VEC16: { \
using attn_impl = \
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
head_dim>; \
return __VA_ARGS__(); \
} \
default: { \
TORCH_CHECK(false, "Invalid CPU attention ISA type."); \
} \
} \
}()
#include "cpu_attn_dispatch_generated.h"

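Editor's note: the hand-written two-level dispatch above is replaced by a single CPU_ATTN_DISPATCH(head_dim, isa, ...) macro whose definition lives in the generated header cpu_attn_dispatch_generated.h, which is not shown in this diff. Purely to illustrate what the call sites below expect, a hand-rolled equivalent could chain the two old macros; this sketch is hypothetical and is not the generated code:

// Hypothetical illustration only; the real macro is code-generated and may
// enumerate concrete (head_dim, ISA) combinations directly.
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...)         \
  CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, [&] {           \
    return CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, __VA_ARGS__);  \
  })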
torch::Tensor get_scheduler_metadata(
const int64_t num_req, const int64_t num_heads_q,

@@ -122,16 +47,14 @@ torch::Tensor get_scheduler_metadata(
input.enable_kv_split = enable_kv_split;

VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
input.elem_size = sizeof(scalar_t);
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
input.output_buffer_elem_size =
sizeof(attn_impl::partial_output_buffer_t);
input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
input.kv_block_alignment = attn_impl::BlockSizeAlignment;
});
CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
input.elem_size = sizeof(scalar_t);
input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
input.output_buffer_elem_size =
sizeof(attn_impl::partial_output_buffer_t);
input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
input.kv_block_alignment = attn_impl::BlockSizeAlignment;
});
});

@@ -184,18 +107,14 @@ void cpu_attn_reshape_and_cache(

VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
attn_impl::reshape_and_cache(
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(),
value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), token_num,
key_token_num_stride, value_token_num_stride, head_num,
key_head_num_stride, value_head_num_stride, num_blocks,
num_blocks_stride, cache_head_num_stride, block_size,
block_size_stride);
});
CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
attn_impl::reshape_and_cache(
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
value_token_num_stride, head_num, key_head_num_stride,
value_head_num_stride, num_blocks, num_blocks_stride,
cache_head_num_stride, block_size, block_size_stride);
});
});
}

@@ -257,12 +176,10 @@ void cpu_attention_with_kv_cache(

VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
cpu_attention::AttentionMainLoop<attn_impl> mainloop;
mainloop(&input);
});
CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
cpu_attention::AttentionMainLoop<attn_impl> mainloop;
mainloop(&input);
});
});
}
@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
const int32_t q_heads_per_kv, const int64_t q_num_stride,
const int64_t q_head_stride, const float scale) {
constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
// static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
constexpr int64_t head_elem_num_pre_block =
AMX_TILE_ROW_BYTES / sizeof(scalar_t);

@@ -816,14 +816,10 @@ struct VecTypeTrait<float> {
using vec_t = vec_op::FP32Vec16;
};

// ARM only supports BF16 with ARMv8.6-A extension
#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
#else
template <>
struct VecTypeTrait<c10::BFloat16> {
using vec_t = vec_op::BF16Vec16;
};
#endif

#if !defined(__powerpc__) && !defined(__s390x__)
template <>

@@ -1111,7 +1107,8 @@ class AttentionMainLoop {
if (sliding_window_left != -1) {
pos = std::max(pos, curr_token_pos - sliding_window_left);
}
return pos;
// Clamp to tile end to avoid OOB when window starts past the tile
return std::min(pos, kv_tile_end_pos);
}();

int32_t right_kv_pos = [&]() {

@@ -1585,17 +1582,10 @@ class AttentionMainLoop {

if (use_sink) {
alignas(64) float s_aux_fp32[16];
#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// ARM without native BF16 support: manual conversion
for (int i = 0; i < 16; ++i) {
s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
}
#else
// All other platforms have BF16Vec16 available
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
vec_op::FP32Vec16 vec_fp32(vec_bf16);
vec_fp32.save(s_aux_fp32);
#endif

float* __restrict__ curr_sum_buffer = sum_buffer;
float* __restrict__ curr_max_buffer = max_buffer;
@@ -4,6 +4,9 @@
#include "cpu_attn_impl.hpp"
#include <arm_neon.h>
#include <type_traits>
#ifdef ARM_BF16_SUPPORT
#include "cpu_attn_neon_bfmmla.hpp"
#endif
namespace cpu_attention {

namespace {

@@ -57,7 +60,7 @@ FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
#endif
}

// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with ASIMD FMLAs
// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
// #FMLAs = (K // 4) * (4 * 2 * M)
// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads

@@ -264,7 +267,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
constexpr static ISA ISAType = ISA::NEON;
constexpr static bool scale_on_logits = false;  // apply scale on q_buffer

// static_assert(HeadDim % HeadDimAlignment == 0);
static_assert(HeadDim % HeadDimAlignment == 0);
// the gemm micro kernel is Mx8
static_assert(HeadDimAlignment % 8 == 0);
static_assert(BlockSizeAlignment % 8 == 0);

@@ -381,6 +384,18 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
}
}
};

#ifdef ARM_BF16_SUPPORT
// For BF16 on Arm, reuse the BFMMLA kernels with 32-token alignment.
template <int64_t head_dim>
class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim>
    : public AttentionImplNEONBFMMLA<BLOCK_SIZE_ALIGNMENT, ISA::NEON,
                                     head_dim> {};
#endif
}  // namespace cpu_attention

#endif  // #ifndef CPU_ATTN_NEON_HPP
#undef BLOCK_SIZE_ALIGNMENT
#undef HEAD_SIZE_ALIGNMENT
#undef MAX_Q_HEAD_NUM_PER_ITER

#endif  // #ifndef CPU_ATTN_ASIMD_HPP
csrc/cpu/cpu_attn_neon_bfmmla.hpp (new file, 682 lines)
@@ -0,0 +1,682 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#ifndef CPU_ATTN_NEON_BFMMLA_HPP
#define CPU_ATTN_NEON_BFMMLA_HPP

#include "cpu_attn_impl.hpp"

#include <arm_neon.h>

#include <cstdint>
#include <vector>

namespace cpu_attention {

namespace {

// BFMMLA tile dimensions
constexpr int32_t TILE_ROWS = 2;  // M dimension
constexpr int32_t TILE_K = 4;     // K reduction
constexpr int32_t TILE_COLS = 2;  // N dimension (column-pair)

// Derived constants
constexpr int32_t OUTPUT_COLS_PER_BLOCK = 8;     // 4 column-pairs
constexpr int32_t K_TOKENS_PER_GROUP = 8;        // Tokens grouped in K cache
constexpr int32_t V_TOKENS_PER_ROW_BLOCK = 4;    // Tokens per V cache row block
constexpr int32_t K_INNER_STRIDE = K_TOKENS_PER_GROUP * TILE_K;
constexpr int32_t V_INNER_STRIDE = V_TOKENS_PER_ROW_BLOCK * TILE_COLS;
constexpr int32_t PACK_ELEMENTS_PER_K_CHUNK = TILE_ROWS * TILE_K;  // A packing
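Editor's note on the tile sizes above: a single BFMMLA instruction multiplies a 2x4 BF16 tile by the transpose of another 2x4 BF16 tile and accumulates a 2x2 FP32 result, which is where TILE_ROWS = 2, TILE_K = 4 and the column-pair notion come from. A self-contained sketch of the intrinsic (illustrative only; requires an Armv8.6-A BF16 target):

#include <arm_neon.h>

// c (a 2x2 FP32 tile, row-major across the four lanes) += a (2x4 BF16) * b^T (b is 2x4 BF16).
static inline float32x4_t bfmmla_2x2(float32x4_t c, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfmmlaq_f32(c, a, b);
}
// Result lanes: { C[0][0], C[0][1], C[1][0], C[1][1] }, matching load_acc_2x2 / store_acc_2x2 below.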
// Matrix Packing and Accumulator
// Reshape two rows of Q into BFMMLA-friendly interleaved
// Input:  row0 = [a0,a1,a2,a3], row1 = [b0,b1,b2,b3]
// Output: [a0,a1,a2,a3,b0,b1,b2,b3, a4,a5,a6,a7,b4,b5,b6,b7]
// For K tail (K % TILE_K != 0): pads with zeros to complete the final chunk
FORCE_INLINE void reshape_Q_2xK_for_bfmmla(const c10::BFloat16* __restrict r0,
                                           const c10::BFloat16* __restrict r1,
                                           c10::BFloat16* __restrict dst,
                                           int32_t K) {
const uint16_t* s0 = reinterpret_cast<const uint16_t*>(r0);
const uint16_t* s1 = reinterpret_cast<const uint16_t*>(r1);
uint16_t* d = reinterpret_cast<uint16_t*>(dst);

// Process TILE_K elements at a time (PACK_ELEMENTS_PER_K_CHUNK output)
int32_t k = 0;
for (; k + TILE_K <= K; k += TILE_K, d += PACK_ELEMENTS_PER_K_CHUNK) {
vst1q_u16(d, vcombine_u16(vld1_u16(s0 + k), vld1_u16(s1 + k)));
}

// Handle K tail: pack remaining elements with zero-padding
const int32_t tail = K - k;
if (tail > 0) {
// Pack remaining tail elements: [r0[k..k+tail-1], pad, r1[k..k+tail-1],
// pad]
for (int32_t t = 0; t < tail; ++t) {
d[t] = s0[k + t];
d[t + TILE_K] = s1[k + t];
}
// Zero-pad the rest
for (int32_t t = tail; t < TILE_K; ++t) {
d[t] = 0;
d[t + TILE_K] = 0;
}
}
}
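Editor's note: a worked example of the packing above, assuming K = 6 (one full TILE_K chunk plus a 2-element tail); illustrative only, not part of this commit:

// Illustrative usage of reshape_Q_2xK_for_bfmmla with a K tail.
void pack_example() {
  c10::BFloat16 r0[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  c10::BFloat16 r1[6] = {10.f, 20.f, 30.f, 40.f, 50.f, 60.f};
  c10::BFloat16 dst[16] = {};
  reshape_Q_2xK_for_bfmmla(r0, r1, dst, /*K=*/6);
  // dst now holds: 1 2 3 4 10 20 30 40 | 5 6 0 0 50 60 0 0
}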
// 2x2 accumulator load/store with compile-time row count
template <int32_t m_rows>
FORCE_INLINE float32x4_t load_acc_2x2(float* base, int64_t ldc, int col_off) {
static_assert(m_rows == 1 || m_rows == 2);
float32x2_t row0 = vld1_f32(base + col_off);
float32x2_t row1 =
    (m_rows == 2) ? vld1_f32(base + ldc + col_off) : vdup_n_f32(0.f);
return vcombine_f32(row0, row1);
}

template <int32_t m_rows>
FORCE_INLINE void store_acc_2x2(float32x4_t acc, float* base, int64_t ldc,
                                int col_off) {
static_assert(m_rows == 1 || m_rows == 2);
vst1_f32(base + col_off, vget_low_f32(acc));
if constexpr (m_rows == 2) {
vst1_f32(base + ldc + col_off, vget_high_f32(acc));
}
}

// Initialize 4 column-pair accumulators for 2 rows (8 columns total)
#define INIT_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows, accum) \
do { \
if (accum) { \
if (m_rows == 2) { \
a0 = load_acc_2x2<2>(Crow, ldc, 0); \
a1 = load_acc_2x2<2>(Crow, ldc, 2); \
a2 = load_acc_2x2<2>(Crow, ldc, 4); \
a3 = load_acc_2x2<2>(Crow, ldc, 6); \
} else { \
a0 = load_acc_2x2<1>(Crow, ldc, 0); \
a1 = load_acc_2x2<1>(Crow, ldc, 2); \
a2 = load_acc_2x2<1>(Crow, ldc, 4); \
a3 = load_acc_2x2<1>(Crow, ldc, 6); \
} \
} else { \
a0 = a1 = a2 = a3 = vdupq_n_f32(0.f); \
} \
} while (0)

// Store 4 column-pair accumulators back to C matrix
#define STORE_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows) \
do { \
if (m_rows == 2) { \
store_acc_2x2<2>(a0, Crow, ldc, 0); \
store_acc_2x2<2>(a1, Crow, ldc, 2); \
store_acc_2x2<2>(a2, Crow, ldc, 4); \
store_acc_2x2<2>(a3, Crow, ldc, 6); \
} else { \
store_acc_2x2<1>(a0, Crow, ldc, 0); \
store_acc_2x2<1>(a1, Crow, ldc, 2); \
store_acc_2x2<1>(a2, Crow, ldc, 4); \
store_acc_2x2<1>(a3, Crow, ldc, 6); \
} \
} while (0)

// Perform 4 BFMMLA operations: acc += A @ B for 4 column-pairs
#define BFMMLA_COMPUTE_4(r0, r1, r2, r3, a, b0, b1, b2, b3) \
do { \
r0 = vbfmmlaq_f32(r0, a, b0); \
r1 = vbfmmlaq_f32(r1, a, b1); \
r2 = vbfmmlaq_f32(r2, a, b2); \
r3 = vbfmmlaq_f32(r3, a, b3); \
} while (0)
// Micro-kernel: updates a small fixed tile using BFMMLA.
// RP = number of row-pairs (1,2,4)
// Computes C[TILE_ROWS*RP, OUTPUT_COLS_PER_BLOCK] += A_packed @ B.
// A_packed interleaves RP row-pairs; B layout is driven by the attention phase:
//   - AttentionGemmPhase::QK -> token-column layout (Q @ K^T)
//   - AttentionGemmPhase::PV -> token-row layout (P @ V)
// K_static < 0 enables runtime K (PV only)
template <int32_t RP, int32_t K_static, AttentionGemmPhase phase>
FORCE_INLINE void gemm_rowpairs_x8_bfmmla_neon(
    const bfloat16_t* const* __restrict A_packed_rp,
    const int32_t* __restrict m_rows_rp, const bfloat16_t* __restrict B_blk,
    float* __restrict C, int64_t ldc, bool accumulate, int64_t b_stride,
    int32_t K_runtime = 0) {
static_assert(RP == 1 || RP == 2 || RP == 4, "RP must be 1,2,4");
static_assert(K_static < 0 || K_static % TILE_K == 0,
              "K must be divisible by TILE_K");
static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
              "Runtime K only supported for PV");

constexpr bool runtime_k = (K_static < 0);
const int32_t K_iters =
    runtime_k ? (K_runtime / TILE_K) : (K_static / TILE_K);
const int32_t K_tail = runtime_k ? (K_runtime % TILE_K) : 0;

if (!runtime_k) {
// Help the compiler fold away unused K_runtime when K is compile-time
(void)K_runtime;
}

auto* C_al = C;
const auto* B_al = B_blk;

// Setup A pointers
const bfloat16_t* a_ptr[4] = {
    A_packed_rp[0],
    (RP >= 2) ? A_packed_rp[1] : nullptr,
    (RP >= 4) ? A_packed_rp[2] : nullptr,
    (RP >= 4) ? A_packed_rp[3] : nullptr,
};

// Setup B pointers based on layout
const bfloat16_t* b_ptr[4];
if constexpr (phase == AttentionGemmPhase::PV) {
b_ptr[0] = B_blk + 0 * b_stride;
b_ptr[1] = B_blk + 1 * b_stride;
b_ptr[2] = B_blk + 2 * b_stride;
b_ptr[3] = B_blk + 3 * b_stride;
}

float32x4_t acc[4][4];

// Initialize accumulators
#define INIT_RP(rp) \
if constexpr (RP > rp) { \
INIT_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp], accumulate); \
}
INIT_RP(0);
INIT_RP(1);
INIT_RP(2);
INIT_RP(3);
#undef INIT_RP

// Main compute loop
for (int32_t ki = 0; ki < K_iters; ++ki) {
bfloat16x8_t b0, b1, b2, b3;
if constexpr (phase == AttentionGemmPhase::PV) {
b0 = vld1q_bf16(b_ptr[0] + ki * V_INNER_STRIDE);
b1 = vld1q_bf16(b_ptr[1] + ki * V_INNER_STRIDE);
b2 = vld1q_bf16(b_ptr[2] + ki * V_INNER_STRIDE);
b3 = vld1q_bf16(b_ptr[3] + ki * V_INNER_STRIDE);
} else {
const bfloat16_t* b_base = B_al + ki * b_stride;
b0 = vld1q_bf16(b_base + 0 * V_INNER_STRIDE);
b1 = vld1q_bf16(b_base + 1 * V_INNER_STRIDE);
b2 = vld1q_bf16(b_base + 2 * V_INNER_STRIDE);
b3 = vld1q_bf16(b_base + 3 * V_INNER_STRIDE);
}

#define COMPUTE_RP(rp) \
if constexpr (RP > rp) { \
bfloat16x8_t a = vld1q_bf16(a_ptr[rp] + ki * PACK_ELEMENTS_PER_K_CHUNK); \
BFMMLA_COMPUTE_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], a, b0, \
b1, b2, b3); \
}
COMPUTE_RP(0);
COMPUTE_RP(1);
COMPUTE_RP(2);
COMPUTE_RP(3);
#undef COMPUTE_RP
}

// K tail for runtime PV: fallback path
if constexpr (runtime_k) {
if (K_tail > 0) {
const int32_t tail_offset = K_iters * V_INNER_STRIDE;
const int32_t a_tail_offset = K_iters * PACK_ELEMENTS_PER_K_CHUNK;
for (int32_t kt = 0; kt < K_tail; ++kt) {
float32x4_t b_vecs[4];
for (int32_t p = 0; p < 4; ++p) {
const bfloat16_t* bp = b_ptr[p] + tail_offset + kt * TILE_COLS;
const float b0 = vcvtah_f32_bf16(bp[0]);
const float b1 = vcvtah_f32_bf16(bp[1]);
const float32x2_t b_pair = vset_lane_f32(b1, vdup_n_f32(b0), 1);
b_vecs[p] = vcombine_f32(b_pair, b_pair);
}

#define TAIL_RP(rp) \
if constexpr (RP > rp) { \
const bfloat16_t* ap = A_packed_rp[rp] + a_tail_offset; \
float a_row0 = vcvtah_f32_bf16(ap[kt]); \
float a_row1 = \
    (m_rows_rp[rp] == 2) ? vcvtah_f32_bf16(ap[kt + TILE_K]) : 0.0f; \
const float32x4_t a_vec = \
    vcombine_f32(vdup_n_f32(a_row0), vdup_n_f32(a_row1)); \
for (int32_t p = 0; p < 4; ++p) { \
acc[rp][p] = vmlaq_f32(acc[rp][p], a_vec, b_vecs[p]); \
} \
}
TAIL_RP(0);
TAIL_RP(1);
TAIL_RP(2);
TAIL_RP(3);
#undef TAIL_RP
}
}
}

// Store results
#define STORE_RP(rp) \
if constexpr (RP > rp) { \
STORE_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp]); \
}
STORE_RP(0);
STORE_RP(1);
STORE_RP(2);
STORE_RP(3);
#undef STORE_RP
}

// Meso-kernel: packs a small MBxK slice of A, then tiles over N and calls the
// micro-kernel for each OUTPUT_COLS_PER_BLOCK chunk. K_static < 0 enables
// runtime K (PV only).
template <int32_t MB, int32_t N, int32_t K_static, AttentionGemmPhase phase>
FORCE_INLINE void gemm_packA_compute_MB_xN(
    const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
    float* __restrict C, int32_t K_runtime, int64_t lda, int64_t ldc,
    int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
static_assert(MB >= 1 && MB <= 8, "MB must be in [1,8]");
static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
              "N must be a multiple of OUTPUT_COLS_PER_BLOCK");
static_assert(K_static < 0 || K_static % TILE_K == 0,
              "K must be divisible by TILE_K");
static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
              "Runtime K only supported for PV");

constexpr bool runtime_k = (K_static < 0);
const int32_t K_val = runtime_k ? K_runtime : K_static;

// Keep small packs on-stack to avoid heap churn
constexpr int32_t STACK_PACK_STRIDE =
    (1024 / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;

constexpr int32_t ROW_PAIRS = (MB + 1) / TILE_ROWS;
const int32_t pack_stride =
    runtime_k ? ((K_val + TILE_K - 1) / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK
              : (K_static / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;

alignas(64) c10::BFloat16 A_packed_stack[ROW_PAIRS * STACK_PACK_STRIDE];
std::vector<c10::BFloat16> A_packed_heap;
c10::BFloat16* A_packed =
    (pack_stride <= STACK_PACK_STRIDE)
        ? A_packed_stack
        : (A_packed_heap.resize(ROW_PAIRS * pack_stride),
           A_packed_heap.data());

for (int32_t rp = 0; rp < ROW_PAIRS; ++rp) {
const int32_t m = rp * TILE_ROWS;
const int32_t m_rows = (m + 1 < MB) ? TILE_ROWS : 1;
const c10::BFloat16* A0 = A + m * lda;
const c10::BFloat16* A1 = (m_rows == TILE_ROWS) ? (A + (m + 1) * lda) : A0;
reshape_Q_2xK_for_bfmmla(A0, A1, A_packed + rp * pack_stride, K_val);
}

for (int32_t n = 0; n < N; n += OUTPUT_COLS_PER_BLOCK) {
const c10::BFloat16* B_blk_c10 =
    (phase == AttentionGemmPhase::PV)
        ? (B + (n / TILE_COLS) * b_layout_stride)
        : (B + (n / OUTPUT_COLS_PER_BLOCK) * b_layout_stride);
const bfloat16_t* B_blk = reinterpret_cast<const bfloat16_t*>(B_blk_c10);

// Process row-pairs in groups of 4, 2, then 1
int32_t row_pair_idx = 0;

#define PROCESS_RP_GROUP(group_size) \
for (; row_pair_idx + (group_size - 1) < ROW_PAIRS; \
     row_pair_idx += group_size) { \
const bfloat16_t* Ap[group_size]; \
int32_t mr[group_size]; \
for (int32_t i = 0; i < group_size; ++i) { \
Ap[i] = reinterpret_cast<const bfloat16_t*>( \
    A_packed + (row_pair_idx + i) * pack_stride); \
mr[i] = (((row_pair_idx + i) * TILE_ROWS + 1) < MB) ? TILE_ROWS : 1; \
} \
float* C_blk = C + (row_pair_idx * TILE_ROWS) * ldc + n; \
if constexpr (runtime_k) { \
gemm_rowpairs_x8_bfmmla_neon<group_size, -1, phase>( \
    Ap, mr, B_blk, C_blk, ldc, accumulate, b_layout_stride, K_val); \
} else { \
gemm_rowpairs_x8_bfmmla_neon<group_size, K_static, phase>( \
    Ap, mr, B_blk, C_blk, ldc, accumulate, \
    (phase == AttentionGemmPhase::PV) ? b_layout_stride \
                                      : b_reduction_stride); \
} \
}

PROCESS_RP_GROUP(4);
PROCESS_RP_GROUP(2);
PROCESS_RP_GROUP(1);
#undef PROCESS_RP_GROUP
}
}

// Macro-kernel: iterates over M in MB={8,4,2,1} chunks.
// Supports compile-time K specialization when K >= 0; otherwise uses runtime K
// (runtime K path is only supported for PV).
template <AttentionGemmPhase phase, int32_t N, int32_t K = -1>
FORCE_INLINE void gemm_macro_neon_bfmmla(
    const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
    float* __restrict C, int32_t M, int32_t K_runtime, int64_t lda, int64_t ldc,
    int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
              "N must be a multiple of OUTPUT_COLS_PER_BLOCK");

if constexpr (K >= 0) {
static_assert(K % TILE_K == 0, "K must be divisible by TILE_K");
for (int32_t m = 0; m < M;) {
const int32_t rem = M - m;
const c10::BFloat16* A_blk = A + m * lda;
float* C_blk = C + m * ldc;

#define DISPATCH_MB(mb) \
gemm_packA_compute_MB_xN<mb, N, K, phase>(A_blk, B, C_blk, 0, lda, ldc, \
                                          b_layout_stride, \
                                          b_reduction_stride, accumulate)

if (rem >= 8) {
DISPATCH_MB(8);
m += 8;
} else if (rem >= 4) {
DISPATCH_MB(4);
m += 4;
} else if (rem >= 2) {
DISPATCH_MB(2);
m += 2;
} else {
DISPATCH_MB(1);
m += 1;
}
#undef DISPATCH_MB
}
} else {
static_assert(phase == AttentionGemmPhase::PV,
              "Runtime K specialization only supported for PV.");
const int32_t K_val = K_runtime;

for (int32_t m = 0; m < M;) {
const int32_t rem = M - m;
const c10::BFloat16* A_blk = A + m * lda;
float* C_blk = C + m * ldc;

#define DISPATCH_MB_RUNTIME(mb) \
gemm_packA_compute_MB_xN<mb, N, -1, phase>(A_blk, B, C_blk, K_val, lda, ldc, \
                                           b_layout_stride, \
                                           b_reduction_stride, accumulate)

if (rem >= 8) {
DISPATCH_MB_RUNTIME(8);
m += 8;
} else if (rem >= 4) {
DISPATCH_MB_RUNTIME(4);
m += 4;
} else if (rem >= 2) {
DISPATCH_MB_RUNTIME(2);
m += 2;
} else {
DISPATCH_MB_RUNTIME(1);
m += 1;
}
#undef DISPATCH_MB_RUNTIME
}
}
}

#undef INIT_ACC_ROWPAIR_4
#undef STORE_ACC_ROWPAIR_4
#undef BFMMLA_COMPUTE_4

}  // namespace
// TileGemm Adapter for Attention

template <typename kv_cache_t, int32_t BlockTokens, int32_t HeadDim>
class TileGemmNEONBFMMLA {
 public:
  template <AttentionGemmPhase phase, int32_t head_dim_ct>
  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
                                kv_cache_t* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                [[maybe_unused]] const int64_t ldb,
                                const int64_t ldc,
                                [[maybe_unused]] const int32_t block_size,
                                [[maybe_unused]] const int32_t dynamic_k_size,
                                const bool accum_c) {
static_assert(BlockTokens % OUTPUT_COLS_PER_BLOCK == 0);
// BFMMLA kernels require compile-time head_dim; keep head_dim_ct only for
// API parity with other tile_gemm implementations.
if constexpr (head_dim_ct >= 0) {
static_assert(head_dim_ct == HeadDim,
              "BFMMLA expects head_dim_ct to match HeadDim; PV passes "
              "-1 for API parity.");
}

if constexpr (phase == AttentionGemmPhase::QK) {
const int64_t b_reduction_stride = K_INNER_STRIDE;
const int64_t b_token_block_stride = (HeadDim / TILE_K) * K_INNER_STRIDE;

gemm_macro_neon_bfmmla<AttentionGemmPhase::QK, BlockTokens, HeadDim>(
    reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
    m_size, 0, lda, ldc, b_token_block_stride, b_reduction_stride,
    accum_c);
} else {
const int64_t b_pair_stride =
    (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;

// PV gemm with runtime K specialization
switch (dynamic_k_size) {
case 32:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 32>(
    reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
    m_size, 32, lda, ldc, b_pair_stride, 0, accum_c);
break;
case 128:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 128>(
    reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
    m_size, 128, lda, ldc, b_pair_stride, 0, accum_c);
break;
case 256:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 256>(
    reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
    m_size, 256, lda, ldc, b_pair_stride, 0, accum_c);
break;
default:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim>(
    reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
    m_size, dynamic_k_size, lda, ldc, b_pair_stride, 0, accum_c);
break;
}
}
}
};
// Shared ASIMD BFMMLA implementation (BF16 only). The block size alignment and
// ISA tag are template parameters so we can reuse the same kernels for
// different NEON configurations.
template <int64_t block_size_alignment, ISA isa_type, int64_t head_dim>
class AttentionImplNEONBFMMLA {
 public:
  using query_t = c10::BFloat16;
  using q_buffer_t = c10::BFloat16;
  using kv_cache_t = c10::BFloat16;
  using logits_buffer_t = float;
  using partial_output_buffer_t = float;
  using prob_buffer_t = c10::BFloat16;

  static constexpr int64_t BlockSizeAlignment = block_size_alignment;
  // HeadDimAlignment equals head_dim so that the PV phase processes
  // the full head dimension in a single gemm call.
  static constexpr int64_t HeadDimAlignment = head_dim;
  static constexpr int64_t MaxQHeadNumPerIteration = 16;
  static constexpr int64_t HeadDim = head_dim;
  static constexpr ISA ISAType = isa_type;
  static constexpr bool scale_on_logits = false;

  static_assert(HeadDim % OUTPUT_COLS_PER_BLOCK == 0);
  static_assert(BlockSizeAlignment % OUTPUT_COLS_PER_BLOCK == 0);
  static_assert(HeadDim % TILE_K == 0, "HeadDim must be a multiple of TILE_K");

 public:
  template <template <typename tile_gemm_t> typename attention>
  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
attention<
    TileGemmNEONBFMMLA<kv_cache_t, static_cast<int32_t>(BlockSizeAlignment),
                       static_cast<int32_t>(HeadDim)>>
    attention_iteration;
attention_iteration(CPU_ATTENTION_PARAMS);
}

// Key cache stride per token group (TokenColumn layout; QK)
static constexpr int64_t k_cache_token_group_stride(
    [[maybe_unused]] const int32_t block_size) {
static_assert(BlockSizeAlignment % K_TOKENS_PER_GROUP == 0);
return (BlockSizeAlignment / K_TOKENS_PER_GROUP) *
       ((head_dim / TILE_K) * K_INNER_STRIDE);
}

// Value cache stride per token group (TokenRow layout; PV)
static constexpr int64_t v_cache_token_group_stride(
    [[maybe_unused]] const int32_t block_size) {
static_assert(BlockSizeAlignment % V_TOKENS_PER_ROW_BLOCK == 0);
return (BlockSizeAlignment / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
}

// The stride to move to the "next" head_dim group
// is the full V cache size per head, since HeadDimAlignment == head_dim.
// Hence, the stride is not used in this case
static constexpr int64_t v_cache_head_group_stride(
    [[maybe_unused]] const int32_t block_size) {
return head_dim * block_size;
}

// Convert Q heads to BF16 and apply scale factor using native BF16 intrinsics
static void copy_q_heads_tile(c10::BFloat16* __restrict__ src,
                              c10::BFloat16* __restrict__ q_buffer,
                              const int32_t q_num,
                              const int32_t q_heads_per_kv,
                              const int64_t q_num_stride,
                              const int64_t q_head_stride, float scale) {
constexpr int32_t dim = static_cast<int32_t>(head_dim);
const float32x4_t scale_vec = vdupq_n_f32(scale);

for (int32_t qi = 0; qi < q_num; ++qi) {
for (int32_t hi = 0; hi < q_heads_per_kv; ++hi) {
c10::BFloat16* __restrict__ curr_q =
    src + qi * q_num_stride + hi * q_head_stride;
c10::BFloat16* __restrict__ dst =
    q_buffer + qi * q_heads_per_kv * head_dim + hi * head_dim;

for (int32_t i = 0; i < dim; i += OUTPUT_COLS_PER_BLOCK) {
bfloat16x8_t in8 =
    vld1q_bf16(reinterpret_cast<const bfloat16_t*>(curr_q + i));
float32x4_t lo = vmulq_f32(vcvtq_low_f32_bf16(in8), scale_vec);
float32x4_t hi = vmulq_f32(vcvtq_high_f32_bf16(in8), scale_vec);

bfloat16x4_t lo_b = vcvt_bf16_f32(lo);
bfloat16x4_t hi_b = vcvt_bf16_f32(hi);
bfloat16x8_t out = vcombine_bf16(lo_b, hi_b);
vst1q_bf16(reinterpret_cast<bfloat16_t*>(dst + i), out);
}
}
}
}
 public:
  // Reshape and cache K/V into BFMMLA-optimized layouts
  // K cache:
  //   [block_size/K_TOKENS_PER_GROUP][head_dim/TILE_K][K_INNER_STRIDE]
  //   - TokenColumn
  // V cache:
  //   [head_dim/TILE_COLS][block_size/V_TOKENS_PER_ROW_BLOCK][V_INNER_STRIDE]
  //   - TokenRows
  static void reshape_and_cache(
      const c10::BFloat16* __restrict__ key,
      const c10::BFloat16* __restrict__ value,
      c10::BFloat16* __restrict__ key_cache,
      c10::BFloat16* __restrict__ value_cache,
      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
      const int64_t head_num, const int64_t key_head_num_stride,
      const int64_t value_head_num_stride,
      [[maybe_unused]] const int64_t num_blocks,
      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
      const int64_t block_size,
      [[maybe_unused]] const int64_t block_size_stride) {
const int64_t k_block_stride = (head_dim / TILE_K) * K_INNER_STRIDE;
const int64_t v_pair_stride =
    (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;

#pragma omp parallel for
for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
const int64_t pos = slot_mapping[token_idx];
if (pos < 0) continue;

const int64_t block_idx = pos / block_size;
const int64_t block_offset = pos % block_size;

// Key cache: TokenColumn QK
{
const c10::BFloat16* __restrict key_src =
    key + token_idx * key_token_num_stride +
    head_idx * key_head_num_stride;

c10::BFloat16* __restrict key_base = key_cache +
    block_idx * num_blocks_stride +
    head_idx * cache_head_num_stride;

const int64_t block_in_block = block_offset / K_TOKENS_PER_GROUP;
const int64_t pair_in_block =
    (block_offset % K_TOKENS_PER_GROUP) / TILE_COLS;
const int64_t lane_base = (block_offset & 1) ? TILE_K : 0;

c10::BFloat16* __restrict block_base =
    key_base + block_in_block * k_block_stride;

for (int64_t hd4 = 0; hd4 < head_dim / TILE_K; ++hd4) {
uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(
    block_base + hd4 * K_INNER_STRIDE +
    pair_in_block * V_INNER_STRIDE + lane_base);
const uint16_t* src_u16 =
    reinterpret_cast<const uint16_t*>(key_src + hd4 * TILE_K);
vst1_u16(dst_u16, vld1_u16(src_u16));
}
}

// Value cache: TokenRow PV
{
const c10::BFloat16* __restrict value_src =
    value + token_idx * value_token_num_stride +
    head_idx * value_head_num_stride;

c10::BFloat16* __restrict value_base =
    value_cache + block_idx * num_blocks_stride +
    head_idx * cache_head_num_stride;

const int64_t row_block = block_offset / V_TOKENS_PER_ROW_BLOCK;
const int64_t lane = block_offset & (V_TOKENS_PER_ROW_BLOCK - 1);

c10::BFloat16* __restrict row_block_base =
    value_base + row_block * V_INNER_STRIDE;

for (int64_t hd2 = 0; hd2 < head_dim / TILE_COLS; ++hd2) {
c10::BFloat16* __restrict dst_val =
    row_block_base + hd2 * v_pair_stride;

const uint16_t* src_u16 =
    reinterpret_cast<const uint16_t*>(value_src);
uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(dst_val);
dst_u16[lane] = src_u16[hd2 * TILE_COLS + 0];
dst_u16[lane + V_TOKENS_PER_ROW_BLOCK] =
    src_u16[hd2 * TILE_COLS + 1];
}
}
}
}
}
};
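Editor's note: to make the TokenColumn key-cache layout implemented above concrete, here is the flat offset of a single K element within one (block, head) slice, assuming head_dim = 128 and the constants defined at the top of this file; illustrative only, not part of this commit:

// Illustrative only: index of K[token = block_offset][dim = d] in one (block, head)
// slice of the key cache, for head_dim = 128.
constexpr int64_t k_cache_elem_index_example(int64_t block_offset, int64_t d) {
  constexpr int64_t head_dim_example = 128;
  constexpr int64_t k_block_stride = (head_dim_example / TILE_K) * K_INNER_STRIDE;
  const int64_t block_in_block = block_offset / K_TOKENS_PER_GROUP;
  const int64_t pair_in_block = (block_offset % K_TOKENS_PER_GROUP) / TILE_COLS;
  const int64_t lane_base = (block_offset & 1) ? TILE_K : 0;
  return block_in_block * k_block_stride + (d / TILE_K) * K_INNER_STRIDE +
         pair_in_block * V_INNER_STRIDE + lane_base + (d % TILE_K);
}
// Token 13, element 20 of the head dimension lands at flat offset 1204.
static_assert(k_cache_elem_index_example(13, 20) == 1204);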
}  // namespace cpu_attention

#endif  // CPU_ATTN_ASIMD_BFMMLA_HPP
File diff suppressed because it is too large.
Some files were not shown because too many files have changed in this diff.