Compare commits
ci/build/2 ... v0.11.1rc2 (1222 commits)
(The compare view's commit table collapsed in extraction: the SHA1 column for all 1222 commits survived — running from c9461e05a4 through 973c9d01da — but every Author, Date, and message cell is empty, so the table is omitted here.)
(file header not captured in extraction)
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 
 
 def print_top_10_largest_files(zip_file):
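The hunk above raises the wheel-size budget from 450 to 500 MiB. For context, a minimal sketch of how such a gate can work — the helper name and the top-10 listing are illustrative, not the repository's exact code:

```python
import os
import zipfile

# Budget mirrors the diff above: env-overridable, defaulting to 500 MiB.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def check_wheel_size(wheel_path: str) -> int:
    """Return 0 if the wheel fits the budget, 1 (CI failure) otherwise."""
    wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if wheel_size_mb > VLLM_MAX_SIZE_MB:
        print(f"{wheel_path}: {wheel_size_mb:.0f} MB exceeds "
              f"the {VLLM_MAX_SIZE_MB} MB limit")
        # Surface the largest members to make the overage easy to debug.
        with zipfile.ZipFile(wheel_path) as zf:
            largest = sorted(zf.infolist(), key=lambda i: i.file_size,
                             reverse=True)[:10]
            for info in largest:
                print(f"  {info.filename}: {info.file_size / 2**20:.1f} MB")
        return 1
    return 0
```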
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
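These config files are consumed by the lm-eval correctness test shown later in this diff. A hedged sketch of reading one — `yaml.safe_load` plus the exact field names from the file above; the loop itself is illustrative:

```python
import yaml  # PyYAML

with open(".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml") as f:
    eval_config = yaml.safe_load(f)

print(eval_config["model_name"])   # "HandH1998/QQQ-Llama-3-8b-g128"
for task in eval_config["tasks"]:  # e.g. gsm8k
    for metric in task["metrics"]:
        # exact_match,strict-match -> 0.419; exact_match,flexible-extract -> 0.416
        print(task["name"], metric["name"], "expected", metric["value"])
```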
(file header not captured in extraction)
@@ -0,0 +1,12 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
(file header not captured in extraction)
@@ -0,0 +1,10 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
(file header not captured in extraction)
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
(file header not captured in extraction)
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file, 1 line)
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

(file header not captured in extraction)
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file, 1 line)
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file, 44 lines)
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m  - huggingface stub or local directory of the model"
+    echo "  -l  - limit number of samples to run"
+    echo "  -t  - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m )
+      MODEL="$OPTARG"
+      ;;
+    l )
+      LIMIT="$OPTARG"
+      ;;
+    t )
+      TP_SIZE="$OPTARG"
+      ;;
+    \? )
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (mode change: Normal file → Executable file, no content changes)

(file header not captured in extraction)
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m  - huggingface stub or local directory of the model"
+    echo "  -l  - limit number of samples to run"
+    echo "  -f  - number of fewshot samples to use"
+    echo "  -t  - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+      MODEL="$OPTARG"
+      ;;
+    b )
+      BATCH_SIZE="$OPTARG"
+      ;;
+    l )
+      LIMIT="$OPTARG"
+      ;;
+    f )
+      FEWSHOT="$OPTARG"
+      ;;
+    t )
+      TP_SIZE="$OPTARG"
+      ;;
+    \? )
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
(file header not captured in extraction)
@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
 
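The expected `value` entries in the YAML configs are compared against measured results with a relative tolerance; the hunk context shows `RTOL = 0.08`. A minimal sketch of that comparison, assuming the config shape above and the usual `lm_eval.simple_evaluate` results layout (`results["results"][task][metric]`); the function itself is illustrative:

```python
import math

RTOL = 0.08  # relative tolerance, from the hunk context above


def check_results(eval_config: dict, results: dict) -> None:
    """Assert each measured metric lies within RTOL of its expected value."""
    for task in eval_config["tasks"]:
        measured = results["results"][task["name"]]
        for metric in task["metrics"]:
            expected = metric["value"]
            got = measured[metric["name"]]
            assert math.isclose(got, expected, rel_tol=RTOL), (
                f"{task['name']}/{metric['name']}: measured {got:.3f}, "
                f"expected {expected:.3f} (rel_tol={RTOL})"
            )
```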
(file header not captured in extraction)
@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 
(file header not captured in extraction)
@@ -368,7 +368,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
     )
 
     # get markdown tables
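The `splitlines()` rewrite fixes a real counting bug: `split('\n')` yields a spurious empty element when the string ends with a newline, over-counting the GPUs (and, before Python 3.12, a backslash escape inside an f-string expression is a syntax error, which the rewrite also avoids). A quick demonstration of the counting difference:

```python
gpu_field = "H100\nH100\n"  # two GPUs, nvidia-smi-style trailing newline

print(len(gpu_field.split("\n")))   # 3 -- trailing empty element inflates the count
print(len(gpu_field.splitlines()))  # 2 -- the actual number of GPUs

# The label the benchmark table wants:
print(f"{len(gpu_field.splitlines())}x{gpu_field.splitlines()[0]}")  # "2xH100"
```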
(file header not captured in extraction)
@@ -181,18 +181,14 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
       -tp $tp \
-      --model $model \
       --port $port \
       $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
       -tp $tp \
-      --model $model \
       --port $port \
       $server_args"
   fi
@@ -365,8 +365,7 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="$server_envs python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
       $server_args"
 
     # run the server
@@ -455,11 +454,6 @@ main() {
   fi
   check_hf_token
 
-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
(deleted file; header not captured in extraction)
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
(Buildkite release pipeline; file header not captured in extraction)
@@ -1,24 +1,37 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
   - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
+  # aarch64 build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
 
+  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
@@ -30,37 +43,33 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
-  - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
 
+  # Build release images (12.9)
   - label: "Build release image (x86)"
     depends_on: ~
     id: build-release-image-x86
@@ -68,13 +77,12 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
       # re-tag to default image tag and push, just in case arm64 build fails
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64
@@ -82,7 +90,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 
     # Add job to create multi-arch manifest
@@ -102,8 +110,6 @@ steps:
     depends_on:
       - create-multi-arch-manifest
       - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-12-9
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
@@ -150,6 +156,22 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
@@ -158,11 +180,16 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -171,3 +198,4 @@ steps:
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
(file header not captured in extraction)
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
 
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF
(file header not captured in extraction; this matches the cleanup-nightly-builds.sh script invoked above)
@@ -8,20 +8,41 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
 
-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
     echo "Error: DOCKERHUB_TOKEN environment variable is not set"
     exit 1
 fi
 
+if [ -z "$DOCKERHUB_USERNAME" ]; then
+    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+    exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+    "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+    echo "Error: Failed to get DockerHub bearer token"
+    exit 1
+fi
+
 # Function to get all tags from DockerHub
 get_all_tags() {
     local page=1
     local all_tags=""
 
     while true; do
-        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+        set +x
+        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
             "$REPO_API_URL?page=$page&page_size=100")
+        set -x
+
         # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -43,7 +64,9 @@ delete_tag() {
|
|||||||
echo "Deleting tag: $tag_name"
|
echo "Deleting tag: $tag_name"
|
||||||
|
|
||||||
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
||||||
local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
|
set +x
|
||||||
|
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
|
||||||
|
set -x
|
||||||
|
|
||||||
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
|
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
|
||||||
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
|
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
|
||||||
|
|||||||
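The tag-cleanup script now exchanges the account credentials for a short-lived bearer token instead of sending $DOCKERHUB_TOKEN on every request, and wraps each curl in set +x/set -x so the secret never lands in the build trace. A standalone sketch of the same flow, runnable outside CI (assumes DOCKERHUB_USERNAME and DOCKERHUB_TOKEN are exported and jq is installed; the endpoints are the ones used in the diff):

# Exchange username/token for a bearer token.
BEARER_TOKEN=$(curl -s -X POST \
    -H "Content-Type: application/json" \
    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
    "https://hub.docker.com/v2/users/login" | jq -r '.token')

# Use it to list one page of tags from the same repository API.
curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
    "https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags?page=1&page_size=5" \
    | jq -r '.results[].name'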
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 
-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
-  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
@@ -167,12 +163,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
     --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
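All of these rewrites rely on bash's ${var//pattern/replacement} substitution, which replaces every occurrence of the literal pattern inside the step's command string. A toy demonstration (the command string below is made up):

commands="pytest -v -s models/test_registry.py && echo done"
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not GritLM'"}
echo "$commands"
# -> pytest -v -s models/test_registry.py -k 'not GritLM' && echo done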
@@ -25,25 +25,28 @@ function cpu_tests() {
 
   # offline inference
   podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    set -xve
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
   # Run basic model test
   podman exec -it "$container_id" bash -c "
-    set -e
+    set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
     pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+    # Note: disable Bart until supports V1
+    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 
 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
 
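A note on the timeout pattern above: `timeout 120m bash -c cpu_tests` starts a fresh bash process, which only sees functions and variables that were exported, hence the `export container_id` and `export -f cpu_tests` lines. A minimal illustration (function name made up):

greet() { echo "hello from an exported function"; }
export -f greet
timeout 5s bash -c greet   # works only because of export -f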
@@ -58,15 +58,11 @@ function cpu_tests() {
   # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
   # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
 
-  # Note: disable Bart until supports V1
-  pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
-  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
+  pytest -x -v -s tests/models/language/generation -m cpu_model
+  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
 
   pytest -x -v -s tests/models/language/pooling -m cpu_model
   pytest -x -v -s tests/models/multimodal/generation \
-    --ignore=tests/models/multimodal/generation/test_mllama.py \
     --ignore=tests/models/multimodal/generation/test_pixtral.py \
     -m cpu_model"
 
@@ -74,7 +70,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
 
   # Note: disable it until supports V1
   # Run AWQ test
.buildkite/scripts/hardware_ci/run-npu-test.sh — new file (191 lines)
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# This script build the Ascend NPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Base ubuntu image with basic ascend development libraries and python installed
+VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
+CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
+TEST_RUN_CONFIG_FILE="vllm_test.cfg"
+VLLM_ASCEND_TMP_DIR=
+# Get the test run configuration file from the vllm-ascend repository
+fetch_vllm_test_cfg() {
+    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
+    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
+    cleanup() {
+        rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    }
+    trap cleanup EXIT
+
+    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
+    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
+        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
+        exit 1
+    fi
+
+    # If the file already exists locally, just overwrite it
+    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
+    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
+
+    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
+    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
+    rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    trap - EXIT
+}
+
+# Downloads test run configuration file from a remote URL.
+# Loads the configuration into the current script environment.
+get_config() {
+    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
+        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
+        exit 1
+    fi
+    source "${TEST_RUN_CONFIG_FILE}"
+    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
+    return 0
+}
+
+# get test running configuration.
+fetch_vllm_test_cfg
+get_config
+# Check if the function call was successful. If not, exit the script.
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
+container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
+agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
+echo "agent_idx: ${agent_idx}"
+builder_name="cachebuilder${agent_idx}"
+builder_cache_dir="/mnt/docker-cache${agent_idx}"
+mkdir -p ${builder_cache_dir}
+
+# Try building the docker image
+cat <<EOF | DOCKER_BUILDKIT=1 docker build \
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
+    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
+    --cache-to type=local,dest=${builder_cache_dir},mode=max \
+    --progress=plain --load -t ${image_name} -f - .
+FROM ${BASE_IMAGE_NAME}
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
+    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
+    apt-get update -y && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install for pytest to make the docker build cache layer always valid
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install pytest>=6.0 modelscope
+
+WORKDIR /workspace/vllm
+
+# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
+COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements/common.txt
+
+COPY . .
+
+# Install vLLM
+RUN --mount=type=cache,target=/root/.cache/pip \
+    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton
+
+# Install vllm-ascend
+WORKDIR /workspace
+ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
+ARG VLLM_ASCEND_TAG=main
+RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
+    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
+
+# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r /workspace/vllm-ascend/requirements.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV VLLM_USE_MODELSCOPE=True
+
+WORKDIR /workspace/vllm-ascend
+
+CMD ["/bin/bash"]
+
+EOF
+
+# Setup cleanup
+remove_docker_container() {
+    docker rm -f "${container_name}" || true;
+    docker image rm -f "${image_name}" || true;
+    docker system prune -f || true;
+}
+trap remove_docker_container EXIT
+
+# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
+# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
+# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
+# returns --device /dev/davinci0 --device /dev/davinci1
+parse_and_gen_devices() {
+    local input="$1"
+    local index cards_num
+    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
+        index="${BASH_REMATCH[1]}"
+        cards_num="${BASH_REMATCH[2]}"
+    else
+        echo "parse error" >&2
+        return 1
+    fi
+
+    local devices=""
+    local i=0
+    while (( i < cards_num )); do
+        local dev_idx=$(((index - 1)*cards_num + i ))
+        devices="$devices --device /dev/davinci${dev_idx}"
+        ((i++))
+    done
+
+    # trim leading space
+    devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Output devices: assigned to the caller variable
+    printf '%s' "$devices"
+}
+
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
+# This test checks whether the OOT platform interface is functioning properly in conjunction with
+# the hardware plugin vllm-ascend.
+model_cache_dir=/mnt/modelscope${agent_idx}
+mkdir -p ${model_cache_dir}
+docker run \
+    ${devices} \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -v ${model_cache_dir}:/root/.cache/modelscope \
+    --entrypoint="" \
+    --name "${container_name}" \
+    "${image_name}" \
+    bash -c '
+    set -e
+    pytest -v -s tests/e2e/vllm_interface/
+    '
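The device-mapping logic in parse_and_gen_devices can be exercised on its own: the agent name encodes a 1-based agent index and a card count, and the function emits a contiguous --device range. A quick check with made-up agent names (logic condensed from the script above):

parse_and_gen_devices() {
    local input="$1" index cards_num
    [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]] || { echo "parse error" >&2; return 1; }
    index="${BASH_REMATCH[1]}"; cards_num="${BASH_REMATCH[2]}"
    local devices="" i=0
    while (( i < cards_num )); do
        devices="$devices --device /dev/davinci$(((index - 1)*cards_num + i))"
        ((i++))
    done
    printf '%s' "${devices# }"
}

parse_and_gen_devices "atlas-a2-001-1-2cards"; echo   # --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices "atlas-a2-001-2-2cards"; echo   # --device /dev/davinci2 --device /dev/davinci3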
@@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer
+    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
@@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer
+    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
 # tpu-info
@@ -35,16 +35,14 @@ docker run \
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
     cd tests
     pytest -v -s v1/core
     pytest -v -s v1/engine
     pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
     pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
     pytest -v -s v1/test_serial_utils.py
-    pytest -v -s v1/test_utils.py
-    pytest -v -s v1/test_metrics_reader.py
     '
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
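`vllm serve` is the packaged CLI front-end for the same OpenAI-compatible server, so the change is cosmetic; the two forms below should behave identically (the --port flag is added here only for illustration):

vllm serve meta-llama/Llama-2-7b-chat-hf --port 8000 &
# ... is equivalent to the older invocation:
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --port 8000 &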
.buildkite/scripts/run-prime-rl-test.sh — new executable file (59 lines)
@@ -0,0 +1,59 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Setup script for Prime-RL integration tests
+# This script prepares the environment for running Prime-RL tests with nightly vLLM
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
+PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+
+echo "Setting up Prime-RL integration test environment..."
+
+# Clean up any existing Prime-RL directory
+if [ -d "${PRIME_RL_DIR}" ]; then
+    echo "Removing existing Prime-RL directory..."
+    rm -rf "${PRIME_RL_DIR}"
+fi
+
+# Install UV if not available
+if ! command -v uv &> /dev/null; then
+    echo "Installing UV package manager..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    source $HOME/.local/bin/env
+fi
+
+# Clone Prime-RL repository at specific branch for reproducible tests
+PRIME_RL_BRANCH="integ-vllm-main"
+echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
+git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
+cd "${PRIME_RL_DIR}"
+
+echo "Setting up UV project environment..."
+export UV_PROJECT_ENVIRONMENT=/usr/local
+ln -s /usr/bin/python3 /usr/local/bin/python
+
+# Remove vllm pin from pyproject.toml
+echo "Removing vllm pin from pyproject.toml..."
+sed -i '/vllm==/d' pyproject.toml
+
+# Sync Prime-RL dependencies
+echo "Installing Prime-RL dependencies..."
+uv sync --inexact && uv sync --inexact --all-extras
+
+# Verify installation
+echo "Verifying installations..."
+uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
+
+echo "Prime-RL integration test environment setup complete!"
+
+echo "Running Prime-RL integration tests..."
+export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
+uv run pytest -vs tests/integration/test_rl.py -m gpu
+
+echo "Prime-RL integration tests completed!"
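The --inexact flag matters here: by default `uv sync` removes installed packages that the project does not declare, which would uninstall the preinstalled nightly vLLM this test is meant to exercise. With the vllm pin stripped from pyproject.toml, an inexact sync leaves it in place. A sketch of the check:

uv sync --inexact                      # install declared deps, keep undeclared ones (e.g. the nightly vLLM)
uv run python -c "import vllm; print(vllm.__version__)"   # should still report the nightly build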
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
 
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
  --seed 42 \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 
-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
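After this change only the default cu129 wheels publish index.html files; the other CUDA variants upload just the wheel. One way to confirm what a given commit published (a sketch; requires read access to the bucket):

aws s3 ls "s3://vllm-wheels/$BUILDKITE_COMMIT/" --recursive | grep index.html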
.buildkite/test-amd.yaml — new file (1319 lines)
File diff suppressed because it is too large
@@ -6,24 +6,28 @@
 # to generate the final pipeline yaml file.
 
 # Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
 # command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-#   in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#   in this case, commands must be specified. the first command runs on the first host, the second
 #   command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#   and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
 
 # When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 # Note that all steps execute in parallel.
@@ -46,25 +50,28 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
-  - tests/mq_llm_engine
-  - tests/async_engine
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
-  - tests/utils_
-  - tests/worker
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s mq_llm_engine # MQLLMEngine
-  - pytest -v -s async_engine # AsyncLLMEngine
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
-  - pytest -v -s multimodal
-  - pytest -v -s utils_ # Utils
-  - pytest -v -s worker # Worker
-  - pytest -v -s transformers_utils # transformers_utils
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s transformers_utils
 
 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -84,25 +91,12 @@ steps:
   - vllm/
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
-  - tests/basic_correctness/test_preemption
   - tests/basic_correctness/test_cumem.py
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
-- label: Core Test # 22min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: true
-  source_file_dependencies:
-  - vllm/core
-  - vllm/distributed
-  - tests/core
-  commands:
-  - pytest -v -s core
 
 - label: Entrypoints Unit Tests # 5min
   timeout_in_minutes: 10
@@ -127,10 +121,9 @@ steps:
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Entrypoints Integration Test (API Server) # 100min
   timeout_in_minutes: 130
@@ -168,7 +161,6 @@ steps:
   num_gpus: 4
   source_file_dependencies:
   - vllm/distributed/
-  - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
@@ -176,28 +168,36 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
   commands:
-  # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with tp=2 and pp=2
+  # test with torchrun tp=2 and pp=2
   - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
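Each torchrun line above launches four local ranks, and the TP_SIZE/PP_SIZE/DP_SIZE/ENABLE_EP variables tell the test script how to partition them; presumably the product of the parallel sizes has to match --nproc-per-node, which is why every listed combination multiplies out to 4. For example:

# tp=2 x dp=2 with expert parallelism enabled, across 4 local ranks
TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py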
@@ -230,16 +230,14 @@ steps:
   num_gpus: 2
   source_file_dependencies:
   - vllm/
-  - tests/metrics
   - tests/v1/tracing
   commands:
-  - pytest -v -s metrics
   - "pip install \
       'opentelemetry-sdk>=1.26.0' \
       'opentelemetry-api>=1.26.0' \
       'opentelemetry-exporter-otlp>=1.26.0' \
       'opentelemetry-semantic-conventions-ai>=0.4.1'"
-  - pytest -v -s tracing
+  - pytest -v -s v1/tracing
 
 ##### fast check tests #####
 ##### 1 GPU test #####
@@ -300,23 +298,35 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
+  - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
+  - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
   - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
-  - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
-  - pytest -v -s v1/kv_connector/unit
-  - pytest -v -s v1/metrics
-  - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s v1/test_utils.py
+  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+  - pytest -v -s -m 'not cpu_test' v1/metrics
   - pytest -v -s v1/test_oracle.py
-  - pytest -v -s v1/test_metrics_reader.py
+  - pytest -v -s v1/test_request.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
+- label: V1 Test others (CPU) # 5 mins
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  no_gpu: true
+  commands:
+  # split the test to avoid interference
+  - pytest -v -s -m 'cpu_test' v1/core
+  - pytest -v -s v1/structured_output
+  - pytest -v -s v1/test_serial_utils.py
+  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+  - pytest -v -s -m 'cpu_test' v1/metrics
+
 - label: Examples Test # 30min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
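The GPU step and the new no_gpu step split the same directories by pytest marker: anything marked cpu_test moves to the cheap CPU runner, and everything else stays on GPU. The split is reproducible locally:

pytest -v -s -m 'cpu_test' v1/core        # the CPU-safe subset
pytest -v -s -m 'not cpu_test' v1/core    # the GPU remainder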
@@ -335,12 +345,14 @@ steps:
   - python3 offline_inference/vision_language.py --seed 0
   - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
-  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
@@ -375,7 +387,12 @@ steps:
       --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
       --ignore=lora/test_chatglm3_tp.py \
       --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss.py \
+      --ignore=lora/test_qwen3moe_tp.py
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -389,11 +406,12 @@ steps:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_fusion_attn.py
+  - pytest -v -s compile/test_functionalization.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s compile/test_sequence_parallelism.py
-  - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
+  - pytest -v -s compile/test_noop_elimination.py
+  - pytest -v -s compile/test_aot_compile.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -406,8 +424,8 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -415,6 +433,7 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
@@ -422,8 +441,9 @@ steps:
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
   commands:
-  - pytest -v -s kernels/core
+  - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 
 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35
@@ -467,32 +487,22 @@ steps:
   source_file_dependencies:
   - csrc/mamba/
   - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
   commands:
   - pytest -v -s kernels/mamba
 
-- label: Tensorizer Test # 14min
-  timeout_in_minutes: 25
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/model_executor/model_loader
-  - tests/tensorizer_loader
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
-  commands:
-  - apt-get update && apt-get install -y curl libsodium23
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s tensorizer_loader
-  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
-- label: Model Executor Test # 7min
-  timeout_in_minutes: 20
+- label: Model Executor Test # 23min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor
   - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
   - apt-get update && apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s model_executor
+  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
 - label: Benchmarks # 11min
   timeout_in_minutes: 20
@@ -526,8 +536,9 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
@@ -548,15 +559,6 @@ steps:
|
|||||||
commands: # LMEval+Transcription WER check
|
commands: # LMEval+Transcription WER check
|
||||||
- pytest -s entrypoints/openai/correctness/
|
- pytest -s entrypoints/openai/correctness/
|
||||||
|
|
||||||
- label: Encoder Decoder tests # 12min
|
|
||||||
timeout_in_minutes: 20
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/encoder_decoder
|
|
||||||
commands:
|
|
||||||
- pytest -v -s encoder_decoder
|
|
||||||
|
|
||||||
- label: OpenAI-Compatible Tool Use # 23 min
|
- label: OpenAI-Compatible Tool Use # 23 min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@@ -564,10 +566,17 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/tool_use
|
- tests/tool_use
|
||||||
- tests/mistral_tool_use
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s -m 'not cpu_test' tool_use
|
||||||
- pytest -v -s mistral_tool_use
|
|
||||||
|
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/tool_use
|
||||||
|
no_gpu: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s -m 'cpu_test' tool_use
|
||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
@@ -607,13 +616,19 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/test_transformers.py
|
- tests/models/test_transformers.py
|
||||||
- tests/models/test_registry.py
|
- tests/models/test_registry.py
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||||
|
|
||||||
|
- label: Basic Models Test (Other CPU) # 5min
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
torch_nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
- tests/models/test_utils.py
|
- tests/models/test_utils.py
|
||||||
- tests/models/test_vision.py
|
- tests/models/test_vision.py
|
||||||
|
no_gpu: true
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_transformers.py \
|
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||||
models/test_registry.py \
|
|
||||||
models/test_utils.py \
|
|
||||||
models/test_vision.py
|
|
||||||
|
|
||||||
- label: Language Models Tests (Standard)
|
- label: Language Models Tests (Standard)
|
||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
@@ -728,6 +743,16 @@ steps:
|
|||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
|
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||||
|
timeout_in_minutes: 70
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/multimodal/
|
||||||
|
- vllm/inputs/
|
||||||
|
- vllm/v1/core/
|
||||||
|
commands:
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1
|
- label: Multi-Modal Models Test (Extended) 1
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
@@ -783,14 +808,16 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||||
- pytest -v -s tests/models/test_initialization.py
|
- pytest -v -s tests/models/test_initialization.py
|
||||||
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
- pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
- python3 examples/offline_inference/audio_language.py --model-type whisper
|
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
|
# Whisper needs spawn method to avoid deadlock
|
||||||
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|
||||||
- label: Blackwell Test # 38 min
|
- label: Blackwell Test # 21 min
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
# optional: true
|
# optional: true
|
||||||
@@ -803,8 +830,6 @@ steps:
|
|||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/compilation/fusion.py
|
|
||||||
- vllm/compilation/fusion_attn.py
|
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
@@ -817,17 +842,77 @@ steps:
|
|||||||
# Quantization
|
# Quantization
|
||||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
|
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||||
|
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||||
|
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||||
# Fusion
|
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
|
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
|
|
||||||
|
- label: Blackwell Fusion Tests # 30 min
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/fp4/
|
||||||
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
|
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||||
|
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||||
|
|
||||||
|
- label: Blackwell GPT-OSS Eval
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
optional: true # run on nightlies
|
||||||
|
source_file_dependencies:
|
||||||
|
- tests/evals/gpt_oss
|
||||||
|
- vllm/model_executor/models/gpt_oss.py
|
||||||
|
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
commands:
|
||||||
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
|
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||||
|
|
||||||
|
- label: Blackwell Quantized MoE Test
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
source_file_dependencies:
|
||||||
|
- tests/quantization/test_blackwell_moe.py
|
||||||
|
- vllm/model_executor/models/deepseek_v2.py
|
||||||
|
- vllm/model_executor/models/gpt_oss.py
|
||||||
|
- vllm/model_executor/models/llama4.py
|
||||||
|
- vllm/model_executor/layers/fused_moe
|
||||||
|
- vllm/model_executor/layers/quantization/compressed_tensors
|
||||||
|
- vllm/model_executor/layers/quantization/modelopt.py
|
||||||
|
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
commands:
|
||||||
|
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
|
- label: Blackwell LM Eval Small Models
|
||||||
|
timeout_in_minutes: 120
|
||||||
|
gpu: b200
|
||||||
|
optional: true # run on nightlies
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@@ -871,47 +956,61 @@ steps:
|
|||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 110min
|
- label: Distributed Tests (2 GPUs) # 68min
|
||||||
timeout_in_minutes: 150
|
timeout_in_minutes: 90
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
|
- vllm/compilation/
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/engine/
|
- vllm/engine/
|
||||||
- vllm/executor/
|
- vllm/executor/
|
||||||
- vllm/model_executor/models/
|
|
||||||
- tests/distributed/
|
|
||||||
- vllm/compilation
|
|
||||||
- vllm/worker/worker_base.py
|
- vllm/worker/worker_base.py
|
||||||
- vllm/worker/worker.py
|
|
||||||
- vllm/worker/model_runner.py
|
|
||||||
- entrypoints/llm/test_collective_rpc.py
|
|
||||||
- tests/v1/test_async_llm_dp.py
|
|
||||||
- tests/v1/test_external_lb_dp.py
|
|
||||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
|
||||||
- vllm/v1/engine/
|
- vllm/v1/engine/
|
||||||
|
- vllm/v1/worker/
|
||||||
|
- tests/compile/test_basic_correctness.py
|
||||||
|
- tests/compile/test_wrapper.py
|
||||||
|
- tests/distributed/
|
||||||
|
- tests/entrypoints/llm/test_collective_rpc.py
|
||||||
|
- tests/v1/distributed
|
||||||
|
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
|
- tests/v1/shutdown
|
||||||
|
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||||
commands:
|
commands:
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
|
- pytest -v -s distributed/test_sequence_parallel.py
|
||||||
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
|
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||||
|
|
||||||
|
- label: Distributed Model Tests (2 GPUs) # 37min
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||||
|
- vllm/model_executor/models/
|
||||||
|
- tests/basic_correctness/
|
||||||
|
- tests/model_executor/model_loader/test_sharded_state_loader.py
|
||||||
|
- tests/models/
|
||||||
|
commands:
|
||||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
|
||||||
# Avoid importing model tests that cause CUDA reinitialization error
|
# Avoid importing model tests that cause CUDA reinitialization error
|
||||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
# test sequence parallel
|
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
|
||||||
# this test fails consistently.
|
|
||||||
# TODO: investigate and fix
|
|
||||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
|
||||||
- pytest -v -s models/multimodal/generation/test_maverick.py
|
|
||||||
|
|
||||||
- label: Plugin Tests (2 GPUs) # 40min
|
- label: Plugin Tests (2 GPUs) # 40min
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@@ -932,6 +1031,11 @@ steps:
|
|||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
# end io_processor plugins test
|
# end io_processor plugins test
|
||||||
|
# begin stat_logger plugins test
|
||||||
|
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||||
|
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||||
|
- pip uninstall dummy_stat_logger -y
|
||||||
|
# end stat_logger plugins test
|
||||||
# other tests continue here:
|
# other tests continue here:
|
||||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
@@ -954,7 +1058,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
|
|
||||||
|
|
||||||
- label: LoRA TP Test (Distributed) # 17 min
|
- label: LoRA TP Test (Distributed) # 17 min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@@ -972,6 +1075,7 @@ steps:
|
|||||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||||
- pytest -v -s -x lora/test_llama_tp.py
|
- pytest -v -s -x lora/test_llama_tp.py
|
||||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||||
|
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||||
|
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test # 33min
|
- label: Weight Loading Multiple GPU Test # 33min
|
||||||
@@ -998,6 +1102,17 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|
||||||
|
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
@@ -1028,9 +1143,38 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
- label: Qwen MoE EP Test # optional
|
##### H200 test #####
|
||||||
|
- label: Distributed Tests (H200) # optional
|
||||||
gpu: h200
|
gpu: h200
|
||||||
optional: true
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
- pytest -v -s tests/compile/test_async_tp.py
|
||||||
|
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||||
|
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||||
|
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||||
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
|
|
||||||
|
##### B200 test #####
|
||||||
|
- label: Distributed Tests (B200) # optional
|
||||||
|
gpu: b200
|
||||||
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
num_gpus: 2
|
||||||
|
commands:
|
||||||
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
|
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||||
|
|
||||||
|
##### RL Integration Tests #####
|
||||||
|
- label: Prime-RL Integration Test # 15min
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
optional: true
|
||||||
|
num_gpus: 2
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
|||||||
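The recurring `source_file_dependencies` keys above gate each Buildkite step on the files a PR touches. A minimal sketch of that idea, assuming simple prefix matching against the changed-file list (an illustration only, not vLLM's actual CI implementation):

    # Hypothetical sketch of source_file_dependencies gating (illustration only,
    # not vLLM's real CI code).
    def step_should_run(changed_files: list[str], deps: list[str]) -> bool:
        # Each dependency acts as a path prefix, e.g. "csrc/" or "tests/kernels/core".
        return any(f.startswith(dep) for dep in deps for f in changed_files)

    assert step_should_run(["csrc/attention.cu"], ["csrc/", "tests/kernels/core"])
    assert not step_should_run(["docs/index.md"], ["csrc/", "tests/kernels/core"])

Under this reading, the additions above (e.g. `- vllm/model_executor/layers/mamba/ops`) simply widen the set of paths that trigger a step.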
47  .coveragerc  Normal file
@@ -0,0 +1,47 @@
+[run]
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
+omit =
+    */tests/*
+    */test_*
+    */__pycache__/*
+    */build/*
+    */dist/*
+    */vllm.egg-info/*
+    */third_party/*
+    */examples/*
+    */benchmarks/*
+    */docs/*
+
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
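The `[paths]` section above is what lets coverage data produced in different install locations merge cleanly: coverage.py remaps every listed location onto the canonical `vllm` source tree when combining. A minimal sketch of the combine step using this config, assuming per-shard data files with hypothetical names:

    # Hedged sketch: combine per-shard coverage data under the .coveragerc above.
    # The shard file names here are hypothetical.
    import coverage

    cov = coverage.Coverage(config_file=".coveragerc")
    cov.combine([".coverage.shard1", ".coverage.shard2"])  # merged via [paths] aliases
    cov.save()
    cov.xml_report()  # writes coverage.xml, per the [xml] section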
4
.git-blame-ignore-revs
Normal file
4
.git-blame-ignore-revs
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
# Migrate from `yapf` & `isort` to `ruff`
|
||||||
|
d6953beb91da4e9c99be4c0a1304a2d24189535c
|
||||||
|
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
|
||||||
|
8fcaaf6a165e661f63fc51be906bc05b0767332f
|
||||||
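(Usage note, not part of the diff: GitHub's blame view picks this file up automatically by name, while a local `git blame` only honors it once configured, e.g. via `git config blame.ignoreRevsFile .git-blame-ignore-revs`.)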
65  .github/CODEOWNERS  vendored
@@ -2,72 +2,86 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
-/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
-/vllm/distributed/kv_transfer @NickLucche
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @heheda12345
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @heheda12345
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
+
+# Transformers backend
+/vllm/model_executor/models/transformers @hmellor
+/tests/models/test_transformers.py @hmellor
 
 # Docs
-/docs @hmellor
+/docs/mkdocs @hmellor
+/docs/**/*.yml @hmellor
+/requirements/docs.txt @hmellor
+.readthedocs.yaml @hmellor
 mkdocs.yaml @hmellor
+
+# Linting
+.markdownlint.yaml @hmellor
+.pre-commit-config.yaml @hmellor
+/tools/pre_commit @hmellor
 
 # CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
+/vllm/v1/worker/cpu* @bigPYJ1151
 /csrc/cpu @bigPYJ1151
 /vllm/platforms/cpu.py @bigPYJ1151
 /cmake/cpu_extension.cmake @bigPYJ1151
 /docker/Dockerfile.cpu @bigPYJ1151
 
 # Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
+/vllm/v1/worker/xpu* @jikunshang
 /vllm/platforms/xpu.py @jikunshang
 /docker/Dockerfile.xpu @jikunshang
 
@@ -102,3 +116,14 @@ mkdocs.yaml @hmellor
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
 /vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
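The `^cpu`/`^xpu` fixes above deserve a note: CODEOWNERS patterns follow gitignore-style globbing, where `^` has no special meaning, so `/vllm/v1/worker/^cpu` matched nothing. A small check of the corrected pattern using Python's fnmatch as a rough stand-in for the glob semantics (an approximation, not GitHub's exact matcher):

    # Approximate the CODEOWNERS glob with fnmatch (illustration only).
    from fnmatch import fnmatch

    assert fnmatch("cpu_model_runner.py", "cpu*")      # matched by the new cpu* pattern
    assert not fnmatch("cpu_model_runner.py", "^cpu")  # '^' is a literal char, so no match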
4  .github/ISSUE_TEMPLATE/750-RFC.yml  vendored
@@ -43,10 +43,6 @@ body:
         Any other things you would like to mention.
     validations:
       required: false
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
   - type: checkboxes
     id: askllm
     attributes:
54  .github/mergify.yml  vendored
@@ -2,6 +2,7 @@ pull_request_rules:
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
+    - label != stale
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/
@@ -10,10 +11,13 @@ pull_request_rules:
     label:
       add:
         - documentation
+    comment:
+      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
+    - label != stale
     - or:
       - files~=^\.github/
       - files~=\.buildkite/
@@ -30,6 +34,7 @@ pull_request_rules:
 - name: label-deepseek
   description: Automatically apply deepseek label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*deepseek.*\.py
       - files~=^tests/.*deepseek.*\.py
@@ -46,6 +51,7 @@ pull_request_rules:
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
+    - label != stale
     - files~=^vllm/entrypoints/
   actions:
     label:
@@ -55,6 +61,7 @@ pull_request_rules:
 - name: label-llama
   description: Automatically apply llama label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*llama.*\.py
       - files~=^tests/.*llama.*\.py
@@ -70,6 +77,7 @@ pull_request_rules:
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/
@@ -83,6 +91,7 @@ pull_request_rules:
 - name: label-new-model
   description: Automatically apply new-model label
   conditions:
+    - label != stale
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py
@@ -94,6 +103,7 @@ pull_request_rules:
 - name: label-performance
   description: Automatically apply performance label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/
@@ -107,6 +117,7 @@ pull_request_rules:
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*qwen.*\.py
       - files~=^tests/.*qwen.*\.py
@@ -121,6 +132,7 @@ pull_request_rules:
 - name: label-gpt-oss
   description: Automatically apply gpt-oss label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*gpt[-_]?oss.*\.py
       - files~=^tests/.*gpt[-_]?oss.*\.py
@@ -142,6 +154,7 @@ pull_request_rules:
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
+    - label != stale
     - or:
       - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
@@ -162,6 +175,7 @@ pull_request_rules:
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py
@@ -171,7 +185,7 @@ pull_request_rules:
       - files=examples/online_serving/openai_chat_completion_structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
       - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_guided_generate.py
+      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label:
@@ -181,6 +195,7 @@ pull_request_rules:
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/
@@ -196,6 +211,7 @@ pull_request_rules:
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/
@@ -208,6 +224,7 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
+    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu
@@ -223,6 +240,7 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
+    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu
@@ -237,9 +255,9 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
+    - label != stale
     - or:
       - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/
@@ -256,8 +274,9 @@ pull_request_rules:
 
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
-    - conflict
-    - -closed
+    - label != stale
+    - conflict
+    - -closed
   actions:
     label:
       add:
@@ -271,10 +290,12 @@ pull_request_rules:
 
 - name: assign reviewer for tensorizer changes
   conditions:
+    - label != stale
+    - or:
     - files~=^vllm/model_executor/model_loader/tensorizer.py
     - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
     - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-    - files~=^tests/tensorizer_loader/
+    - files~=^tests/model_executor/model_loader/tensorizer_loader/
   actions:
     assign:
       users:
@@ -282,6 +303,7 @@ pull_request_rules:
 
 - name: assign reviewer for modelopt changes
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
       - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -296,9 +318,27 @@ pull_request_rules:
 
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
     - -closed
   actions:
     label:
       remove:
         - needs-rebase
+
+- name: label-kv-connector
+  description: Automatically apply kv-connector label
+  conditions:
+    - label != stale
+    - or:
+      - files~=^examples/online_serving/disaggregated[^/]*/.*
+      - files~=^examples/offline_inference/disaggregated[^/]*/.*
+      - files~=^examples/others/lmcache/
+      - files~=^tests/v1/kv_connector/
+      - files~=^vllm/distributed/kv_transfer/
+      - title~=(?i)\bP/?D\b
+      - title~=(?i)NIXL
+      - title~=(?i)LMCache
+  actions:
+    label:
+      add:
+        - kv-connector
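The new `label-kv-connector` rule matches on PR titles as well as file paths; the `(?i)\bP/?D\b` pattern is presumably meant to catch disaggregated prefill/decode PRs. A quick sanity check of those title regexes in Python, which uses the same regex syntax (shown for illustration only):

    # Sanity-check the kv-connector title patterns (illustrative).
    import re

    title_patterns = [r"(?i)\bP/?D\b", r"(?i)NIXL", r"(?i)LMCache"]

    def matches_kv_connector(title: str) -> bool:
        return any(re.search(p, title) for p in title_patterns)

    assert matches_kv_connector("[P/D] Fix NixlConnector handshake")
    assert not matches_kv_connector("Speed up sampler")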
130  .github/workflows/issue_autolabel.yml  vendored
@@ -13,6 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Label issues based on keywords
+        id: label-step
         uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
         with:
           script: |
@@ -42,7 +43,6 @@ jobs:
                   searchIn: "body"
                 },
               ],
-
               // Substring search - matches anywhere in text (partial matches)
               substrings: [
                 {
@@ -89,14 +89,12 @@ jobs:
                   term: "hip_",
                   searchIn: "both"
                 },
-
                 // ROCm tools and libraries
                 {
                   term: "hipify",
                   searchIn: "both"
                 },
               ],
-
               // Regex patterns - for complex pattern matching
               regexPatterns: [
                 {
@@ -107,13 +105,17 @@ jobs:
                 }
               ],
             },
+            // Add more label configurations here as needed
+            // example: {
+            //   keywords: [...],
+            //   substrings: [...],
+            //   regexPatterns: [...]
+            // },
           };
-
           // Helper function to create regex based on search type
           function createSearchRegex(term, type) {
             // Escape special regex characters in the term
             const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-
             switch (type) {
               case 'keyword':
                 // Word boundary search - matches whole words only
@@ -125,16 +127,13 @@ jobs:
                 throw new Error(`Unknown search type: ${type}`);
             }
           }
-
           // Helper function to find matching terms in text with line information
          function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
             const matches = [];
             const lines = text.split('\n');
-
             for (const termConfig of searchTerms) {
               let regex;
               let term, searchIn, pattern, description, flags;
-
               // Handle different input formats (string or object)
               if (typeof termConfig === 'string') {
                 term = termConfig;
@@ -146,21 +145,17 @@ jobs:
                 description = termConfig.description;
                 flags = termConfig.flags;
               }
-
               // Skip if this term shouldn't be searched in the current location
               if (searchIn !== 'both' && searchIn !== searchLocation) {
                 continue;
               }
-
               // Create appropriate regex
               if (searchType === 'regex') {
                 regex = new RegExp(pattern, flags || "gi");
               } else {
                 regex = createSearchRegex(term, searchType);
               }
-
               const termMatches = [];
-
               // Check each line for matches
               lines.forEach((line, lineIndex) => {
                 const lineMatches = line.match(regex);
@@ -183,7 +178,6 @@ jobs:
                   });
                 }
               });
-
               if (termMatches.length > 0) {
                 matches.push({
                   term: term || (description || pattern),
@@ -196,64 +190,48 @@ jobs:
                 });
               }
             }
-
             return matches;
           }
-
           // Helper function to check if label should be added
           async function processLabel(labelName, config) {
             const body = context.payload.issue.body || "";
             const title = context.payload.issue.title || "";
-
             core.notice(`Processing label: ${labelName}`);
             core.notice(`Issue Title: "${title}"`);
             core.notice(`Issue Body length: ${body.length} characters`);
-
             let shouldAddLabel = false;
             let allMatches = [];
             let reason = '';
-
             const keywords = config.keywords || [];
             const substrings = config.substrings || [];
             const regexPatterns = config.regexPatterns || [];
-
             core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-
             // Search in title
             if (title.trim()) {
               core.notice(`Searching in title: "${title}"`);
-
               const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
               const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
               const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-
               allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
             }
-
             // Search in body
             if (body.trim()) {
               core.notice(`Searching in body (${body.length} characters)`);
-
               const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
               const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
               const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-
               allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
             }
-
             if (allMatches.length > 0) {
               core.notice(`Found ${allMatches.length} matching term(s):`);
-
               for (const termMatch of allMatches) {
                 const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                 const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-
                 if (termMatch.searchType === 'regex') {
                   core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                 } else {
                   core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                 }
-
                 // Show details for each match
                 termMatch.matches.forEach((match, index) => {
                   core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +244,6 @@ jobs:
                 }
               });
             }
-
             shouldAddLabel = true;
             const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
             const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +251,10 @@ jobs:
             const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
             const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
             const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-
             reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
           }
-
           core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
           core.notice(`Reason: ${reason || 'No matching terms found'}`);
-
           if (shouldAddLabel) {
             const existingLabels = context.payload.issue.labels.map(l => l.name);
             if (!existingLabels.includes(labelName)) {
@@ -296,14 +270,92 @@ jobs:
             core.notice(`Label "${labelName}" already present.`);
             return false;
           }
-
           core.notice(`No matching terms found for label "${labelName}".`);
           return false;
         }
-
         // Process all configured labels
-        const processLabels = Object.entries(labelConfig)
-          .map(([labelName, config]) => processLabel(labelName, config));
-        const labelsAdded = await Promise.all(processLabels);
-        const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+        const labelsAddedResults = await Promise.all(
+          Object.entries(labelConfig).map(([labelName, config]) =>
+            processLabel(labelName, config).then(added => ({ labelName, added }))
+          )
+        );
+
+        const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
         core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+
+        // Return which labels were added for the next step
+        const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+        core.setOutput('labels_added', JSON.stringify(addedLabels));
+        return addedLabels;
|
||||||
|
|
||||||
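As an aside: the change above replaces a two-step map/`Promise.all` with a single fan-out whose promises resolve to `{ labelName, added }` records, so the step can both count additions and report which labels fired. A minimal Python sketch of the same pattern, purely illustrative (names like `process_label` are hypothetical stand-ins for the workflow's JavaScript):

```python
import asyncio

LABEL_CONFIG = {"rocm": {"keywords": ["rocm"]}, "cuda": {"keywords": []}}

async def process_label(name: str, config: dict) -> bool:
    """Hypothetical stand-in for the workflow's processLabel(); True = label added."""
    return bool(config.get("keywords"))

async def check(name: str, config: dict) -> tuple[str, bool]:
    # Mirrors `.then(added => ({ labelName, added }))`: keep the name with its result.
    return name, await process_label(name, config)

async def main() -> None:
    results = await asyncio.gather(*(check(n, c) for n, c in LABEL_CONFIG.items()))
    added = [name for name, ok in results if ok]
    print(f"Processing complete. {len(added)} label(s) added.", added)

asyncio.run(main())
```

Keeping the label name next to its boolean result is what lets the step emit `labels_added` as a JSON output for the follow-up CC step below.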
+  - name: CC users for labeled issues
+    if: steps.label-step.outputs.labels_added != '[]'
+    uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+    with:
+      script: |
+        // Configuration: Map labels to GitHub users to CC
+        // You can add multiple users per label, and multiple label configurations
+        const ccConfig = {
+          rocm: {
+            users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
+            message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
+          },
+          // Add more label -> user mappings here
+          // Example:
+          // cuda: {
+          //   users: ['user1', 'user2'],
+          //   message: 'CC {users} for CUDA-related issue'
+          // },
+          // performance: {
+          //   users: ['perfexpert'],
+          //   message: 'CC {users} for performance issue'
+          // },
+        };
+
+        const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
+        core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+
+        // Get existing comments to check for already mentioned users
+        const comments = await github.rest.issues.listComments({
+          owner: context.repo.owner,
+          repo: context.repo.repo,
+          issue_number: context.issue.number,
+        });
+
+        const issueBody = context.payload.issue.body || '';
+        const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
+
+        // Process each label that was added
+        for (const label of labelsAdded) {
+          if (ccConfig[label]) {
+            const config = ccConfig[label];
+            const usersToMention = [];
+
+            // Check which users haven't been mentioned yet
+            for (const user of config.users) {
+              const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+              if (!mentionPattern.test(allExistingText)) {
+                usersToMention.push(user);
+              } else {
+                core.notice(`@${user} already mentioned for label "${label}", skipping`);
+              }
+            }
+
+            // Post comment if there are users to mention
+            if (usersToMention.length > 0) {
+              const mentions = usersToMention.map(u => `@${u}`).join(' ');
+              const message = config.message.replace('{users}', mentions);
+
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: message
+              });
+
+              core.notice(`CC comment added for label "${label}": ${mentions}`);
+            } else {
+              core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+            }
+          }
+        }
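The step above only pings users who have not already been @-mentioned anywhere in the issue body or earlier comments. A small Python sketch of that dedup check, assuming the same single concatenated text blob the workflow builds:

```python
import re

def users_to_mention(users: list[str], existing_text: str) -> list[str]:
    """Skip users whose @handle already appears (case-insensitively) in the
    issue body or any prior comment, mirroring the workflow's RegExp test."""
    return [
        u for u in users
        if not re.search(rf"@{re.escape(u)}\b", existing_text, re.IGNORECASE)
    ]

existing = "Issue body...\nCC @hongxiayang for ROCm"
print(users_to_mention(["hongxiayang", "tjtanaa", "vllmellm"], existing))
# -> ['tjtanaa', 'vllmellm']
```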
2 .github/workflows/stale.yml vendored
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
3 .gitignore vendored
@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**
 
+# uv
+uv.lock
+
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
@@ -4,7 +4,6 @@ MD013: false
 MD024:
   siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
@@ -6,30 +6,19 @@ default_stages:
 - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
-- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.14.0
   hooks:
-  - id: ruff
+  - id: ruff-check
     args: [--output-format, github, --fix]
   - id: ruff-format
-    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.38.1
   hooks:
   - id: typos
+    args: [--force-exclude]
-- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v21.1.2
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -46,10 +35,10 @@ repos:
   hooks:
   - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.9.1
   hooks:
   - id: pip-compile
-    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
     files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
@@ -60,38 +49,32 @@ repos:
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
     name: Run mypy for local Python installation
-    entry: tools/mypy.sh 0 "local"
-    language: python
-    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+    entry: python tools/pre_commit/mypy.py 0 "local"
     stages: [pre-commit] # Don't run in CI
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: tools/mypy.sh 1 "3.9"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
+    <<: &mypy_common
+      language: python
+      types_or: [python, pyi]
+      require_serial: true
+      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
-    entry: tools/mypy.sh 1 "3.10"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.10"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.11
-    entry: tools/mypy.sh 1 "3.11"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.11"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.12
-    entry: tools/mypy.sh 1 "3.12"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.12"
+    <<: *mypy_common
+    stages: [manual] # Only run in CI
+  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.13
+    entry: python tools/pre_commit/mypy.py 1 "3.13"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
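The `<<: &mypy_common` / `<<: *mypy_common` lines in the hunk above use YAML merge keys: one anchored mapping supplies the shared hook fields, and each per-version hook merges it in. A quick standalone way to see the expansion (not a valid pre-commit config, just the YAML mechanism; PyYAML resolves merge keys on load):

```python
import yaml

doc = """
defs:
  mypy_common: &mypy_common
    language: python
    types_or: [python, pyi]
    require_serial: true
hooks:
  - id: mypy-3.10
    entry: python tools/pre_commit/mypy.py 1 "3.10"
    <<: *mypy_common
"""
hook = yaml.safe_load(doc)["hooks"][0]
print(hook["language"], hook["types_or"])  # fields inherited from the anchor
```

This is what lets the config drop four copies of `language`/`types`/`additional_dependencies` while adding a fifth mypy hook for Python 3.13.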
@@ -155,18 +138,15 @@ repos:
     additional_dependencies: [regex]
   - id: check-pickle-imports
     name: Prevent new pickle/cloudpickle imports
-    entry: python tools/check_pickle_imports.py
+    entry: python tools/pre_commit/check_pickle_imports.py
     language: python
     types: [python]
-    pass_filenames: false
-    additional_dependencies: [pathspec, regex]
+    additional_dependencies: [regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/validate_config.py
     language: python
-    types: [python]
-    pass_filenames: true
-    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+    additional_dependencies: [regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
@@ -13,6 +13,7 @@ build:
 
 mkdocs:
   configuration: mkdocs.yaml
+  fail_on_warning: true
 
 # Optionally declare the Python requirements required to build your docs
 python:
100 CMakeLists.txt
@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
 
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -45,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -82,6 +86,9 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
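The new branch above gives CUDA 13 toolkits their own supported-arch list. Roughly, in Python (a sketch of only the two branches visible in this hunk; the function name is hypothetical, and older toolkits fall through to the remaining CMake branches):

```python
def cuda_supported_archs(nvcc_version: tuple[int, int]) -> list[str]:
    """Mirror of the CMake selection: CUDA 13 drops the oldest Volta targets
    (7.0/7.2) and picks up the new 11.0/12.0 compute capabilities."""
    if nvcc_version >= (13, 0):
        return ["7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "11.0", "12.0"]
    if nvcc_version >= (12, 8):
        return ["7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9",
                "9.0", "10.0", "10.1", "12.0"]
    raise ValueError("older toolkits are handled by the remaining CMake branches")

print(cuda_supported_archs((13, 0)))
```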
@@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+  if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+  endif()
+endif()
+
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -256,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -291,10 +317,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu"
     "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
     "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
 
@@ -428,7 +452,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
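Throughout this and the following hunks the arch tags gain an `a` (arch-specific) suffix or, under CUDA 13, an `f` (family-specific) suffix, which is carried into the generated `-gencode` flags. An illustrative mapping, assuming nvcc accepts the suffixed names as it does for e.g. `sm_90a` (this is a sketch, not vLLM's actual `cuda_archs_loose_intersection` helper):

```python
def gencode_flag(arch_tag: str) -> str:
    """Map a CMake-style arch tag ('12.0a', '10.0f') to the nvcc -gencode
    argument it roughly corresponds to. 'a' targets are tied to one chip;
    'f' targets are meant to cover a whole architecture family."""
    sm = arch_tag.replace(".", "")  # '12.0a' -> '120a'
    return f"arch=compute_{sm},code=sm_{sm}"

print(gencode_flag("12.0a"))  # arch=compute_120a,code=sm_120a
print(gencode_flag("10.0f"))  # arch=compute_100f,code=sm_100f
```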
@@ -458,7 +486,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
@@ -538,7 +570,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -557,7 +593,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # FP4 Archs and flags
-  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -579,10 +619,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu"
       "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -624,7 +667,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
@@ -645,7 +692,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   # moe_data.cu is used by all CUTLASS MoE kernels.
-  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
@@ -664,7 +715,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
@@ -780,6 +835,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  # Hadacore kernels
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  if(HADACORE_ARCHS)
+    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${HADACORE_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    message(STATUS "Building hadacore")
+  endif()
+
 # if CUDA endif
 endif()
 
@@ -817,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -941,6 +1008,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
+  include(cmake/external_projects/qutlass.cmake)
 
   # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@@ -148,6 +149,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine
 
 Slack Sponsor: Anyscale
@@ -1,874 +1,20 @@
-# Benchmarking vLLM
+# Benchmarks
 
-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
+This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
 
-## Dataset Overview
+## Contents
 
-<table style="width:100%; border-collapse: collapse;">
-<thead>
-<tr>
-<th style="width:15%; text-align: left;">Dataset</th>
-<th style="width:10%; text-align: center;">Online</th>
-<th style="width:10%; text-align: center;">Offline</th>
-<th style="width:65%; text-align: left;">Data Path</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td><strong>ShareGPT</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
-</tr>
-<tr>
-<td><strong>ShareGPT4V (Image)</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td>
-<code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
-<br>
-<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
-<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
-</td>
-</tr>
-<tr>
-<td><strong>ShareGPT4Video (Video)</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td>
-<code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
-</td>
-</tr>
-<tr>
-<td><strong>BurstGPT</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
-</tr>
-<tr>
-<td><strong>Sonnet (deprecated)</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td>Local file: <code>benchmarks/sonnet.txt</code></td>
-</tr>
-<tr>
-<td><strong>Random</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>synthetic</code></td>
-</tr>
-<tr>
-<td><strong>RandomMultiModal (Image/Video)</strong></td>
-<td style="text-align: center;">🟡</td>
-<td style="text-align: center;">🚧</td>
-<td><code>synthetic</code></td>
-</tr>
-<tr>
-<td><strong>Prefix Repetition</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>synthetic</code></td>
-</tr>
-<tr>
-<td><strong>HuggingFace-VisionArena</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>lmarena-ai/VisionArena-Chat</code></td>
-</tr>
-<tr>
-<td><strong>HuggingFace-InstructCoder</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>likaixin/InstructCoder</code></td>
-</tr>
-<tr>
-<td><strong>HuggingFace-AIMO</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>AI-MO/aimo-validation-aime</code>, <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
-</tr>
-<tr>
-<td><strong>HuggingFace-Other</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
-</tr>
-<tr>
-<td><strong>HuggingFace-MTBench</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>philschmid/mt-bench</code></td>
-</tr>
-<tr>
-<td><strong>HuggingFace-Blazedit</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
-</tr>
-<tr>
-<td><strong>Spec Bench</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
-</tr>
-<tr>
-<td><strong>Custom</strong></td>
-<td style="text-align: center;">✅</td>
-<td style="text-align: center;">✅</td>
-<td>Local file: <code>data.jsonl</code></td>
-</tr>
-</tbody>
-</table>
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
+- **Throughput benchmarks**: Scripts for testing offline batch inference performance
+- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
+- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
 
-✅: supported
-🟡: Partial support
-🚧: to be supported
+## Usage
 
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
-For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
 
-```bash
---dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
-```
+For full CLI reference see:
+
+- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
+- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
+- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>
 
-## 🚀 Example - Online Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-First start serving your model
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-Then run the benchmarking script
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench serve \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-============ Serving Benchmark Result ============
-Successful requests: 10
-Benchmark duration (s): 5.78
-Total input tokens: 1369
-Total generated tokens: 2212
-Request throughput (req/s): 1.73
-Output token throughput (tok/s): 382.89
-Total Token throughput (tok/s): 619.85
----------------Time to First Token----------------
-Mean TTFT (ms): 71.54
-Median TTFT (ms): 73.88
-P99 TTFT (ms): 79.49
------Time per Output Token (excl. 1st token)------
-Mean TPOT (ms): 7.91
-Median TPOT (ms): 7.96
-P99 TPOT (ms): 8.03
----------------Inter-token Latency----------------
-Mean ITL (ms): 7.74
-Median ITL (ms): 7.70
-P99 ITL (ms): 8.39
-==================================================
-```
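For reference, the latency metrics in the deleted example above relate to per-token arrival times roughly as follows (a back-of-envelope sketch with made-up timestamps, not the benchmark's actual implementation):

```python
# Hypothetical arrival times (seconds) of the four generated tokens of one request.
start = 0.0
token_times = [0.072, 0.080, 0.088, 0.096]

ttft = token_times[0] - start                                 # Time To First Token
gaps = [b - a for a, b in zip(token_times, token_times[1:])]  # inter-token gaps
tpot = sum(gaps) / len(gaps)                                  # Time Per Output Token, excl. 1st

print(f"TTFT {ttft * 1e3:.1f} ms, mean TPOT/ITL {tpot * 1e3:.1f} ms")
```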
-### Custom Dataset
-
-If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
-
-```json
-{"prompt": "What is the capital of India?"}
-{"prompt": "What is the capital of Iran?"}
-{"prompt": "What is the capital of China?"}
-```
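A one-liner for producing such a `data.jsonl` from a list of prompts (illustrative; any JSONL writer works):

```python
import json

prompts = [
    "What is the capital of India?",
    "What is the capital of Iran?",
    "What is the capital of China?",
]
with open("data.jsonl", "w") as f:
    for p in prompts:
        # One JSON object per line; the "prompt" key is what CustomDataset expects.
        f.write(json.dumps({"prompt": p}) + "\n")
```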
-```bash
-# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
-```
-
-```bash
-# run benchmarking script
-vllm bench serve --port 9001 --save-result --save-detailed \
-  --backend vllm \
-  --model meta-llama/Llama-3.1-8B-Instruct \
-  --endpoint /v1/completions \
-  --dataset-name custom \
-  --dataset-path <path-to-your-data-jsonl> \
-  --custom-skip-chat-template \
-  --num-prompts 80 \
-  --max-concurrency 1 \
-  --temperature=0.3 \
-  --top-p=0.75 \
-  --result-dir "./log/"
-```
-
-You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-# need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --hf-split train \
-  --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-  --speculative-config $'{"method": "ngram",
-  "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-  "prompt_lookup_min": 2}'
-```
-
-``` bash
-vllm bench serve \
-  --model meta-llama/Meta-Llama-3-8B-Instruct \
-  --dataset-name hf \
-  --dataset-path likaixin/InstructCoder \
-  --num-prompts 2048
-```
-
-### Spec Bench Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-  --speculative-config $'{"method": "ngram",
-  "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-  "prompt_lookup_min": 2}'
-```
-
-[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
-
-Run all categories:
-
-``` bash
-# Download the dataset using:
-# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
-
-vllm bench serve \
-  --model meta-llama/Meta-Llama-3-8B-Instruct \
-  --dataset-name spec_bench \
-  --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
-  --num-prompts -1
-```
-
-Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
-
-Run only a specific category like "summarization":
-
-``` bash
-vllm bench serve \
-  --model meta-llama/Meta-Llama-3-8B-Instruct \
-  --dataset-name spec_bench \
-  --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
-  --num-prompts -1 \
-  --spec-bench-category "summarization"
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-``` bash
-vllm bench serve \
-  --model Qwen/QwQ-32B \
-  --dataset-name hf \
-  --dataset-path AI-MO/aimo-validation-aime \
-  --num-prompts 10 \
-  --seed 42
-```
-
-`philschmid/mt-bench`:
-
-``` bash
-vllm bench serve \
-  --model Qwen/QwQ-32B \
-  --dataset-name hf \
-  --dataset-path philschmid/mt-bench \
-  --num-prompts 80
-```
-
-`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
-
-``` bash
-vllm bench serve \
-  --model Qwen/QwQ-32B \
-  --dataset-name hf \
-  --dataset-path vdaita/edit_5k_char \
-  --num-prompts 90 \
-  --blazedit-min-distance 0.01 \
-  --blazedit-max-distance 0.99
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-vllm bench serve \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --top-k 10 \
-  --top-p 0.9 \
-  --temperature 0.5 \
-  --num-prompts 10
-```
-
-### Running With Ramp-Up Request Rate
-
-The benchmark tool also supports ramping up the request rate over the
-duration of the benchmark run. This can be useful for stress testing the
-server or finding the maximum throughput that it can handle, given some latency budget.
-
-Two ramp-up strategies are supported:
-
-- `linear`: Increases the request rate linearly from a start value to an end value.
-- `exponential`: Increases the request rate exponentially.
-
-The following arguments can be used to control the ramp-up:
-
-- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
-- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
-- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
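For intuition, the two deleted ramp-up strategies correspond to request-rate schedules like the following (a hypothetical rendering, not the benchmark's actual code):

```python
def ramp_rate(t: float, duration: float, start_rps: float, end_rps: float,
              strategy: str = "linear") -> float:
    """Request rate at elapsed time t over a run of the given duration."""
    frac = min(max(t / duration, 0.0), 1.0)
    if strategy == "linear":
        return start_rps + (end_rps - start_rps) * frac
    # exponential: the rate grows by a constant multiplicative factor per unit time
    return start_rps * (end_rps / start_rps) ** frac

print(ramp_rate(30, 60, 1, 16))                 # 8.5 rps halfway through, linear
print(ramp_rate(30, 60, 1, 16, "exponential"))  # 4.0 rps halfway through
```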
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## 📈 Example - Offline Throughput Benchmark
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset-name sonnet \
|
|
||||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
If successful, you will see the following output
|
|
||||||
|
|
||||||
```text
|
|
||||||
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
|
||||||
Total num prompt tokens: 5014
|
|
||||||
Total num output tokens: 1500
|
|
||||||
```
|
|
||||||
|
|
||||||
### VisionArena Benchmark for Vision Language Models
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--backend vllm-chat \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
|
||||||
--num-prompts 1000 \
|
|
||||||
--hf-split train
|
|
||||||
```
|
|
||||||
|
|
||||||
The `num prompt tokens` now includes image token counts
|
|
||||||
|
|
||||||
```text
|
|
||||||
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
|
||||||
Total num prompt tokens: 14527
|
|
||||||
Total num output tokens: 1280
|
|
||||||
```
|
|
||||||
|
|
||||||
### InstructCoder Benchmark with Speculative Decoding
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
|
||||||
VLLM_USE_V1=1 \
|
|
||||||
vllm bench throughput \
|
|
||||||
--dataset-name=hf \
|
|
||||||
--dataset-path=likaixin/InstructCoder \
|
|
||||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
|
||||||
--input-len=1000 \
|
|
||||||
--output-len=100 \
|
|
||||||
--num-prompts=2048 \
|
|
||||||
--async-engine \
|
|
||||||
--speculative-config $'{"method": "ngram",
|
|
||||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
|
||||||
"prompt_lookup_min": 2}'
|
|
||||||
```
|
|
||||||
|
|
||||||
```text
|
|
||||||
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
|
||||||
Total num prompt tokens: 261136
|
|
||||||
Total num output tokens: 204800
|
|
||||||
```
|
|
||||||
|
|
||||||
### Other HuggingFaceDataset Examples
|
|
||||||
|
|
||||||
`lmms-lab/LLaVA-OneVision-Data`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--backend vllm-chat \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
|
||||||
--hf-split train \
|
|
||||||
--hf-subset "chart2text(cauldron)" \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--backend vllm-chat \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
`AI-MO/aimo-validation-aime`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/QwQ-32B \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path AI-MO/aimo-validation-aime \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
Benchmark with LoRA adapters:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
# download dataset
|
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
|
||||||
vllm bench throughput \
|
|
||||||
--model meta-llama/Llama-2-7b-hf \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
|
||||||
--dataset_name sharegpt \
|
|
||||||
--num-prompts 10 \
|
|
||||||
--max-loras 2 \
|
|
||||||
--max-lora-rank 8 \
|
|
||||||
--enable-lora \
|
|
||||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
|
||||||
```
|
|
||||||
|
|
||||||

</details>

## 🛠️ Example - Structured Output Benchmark

<details>
<summary>Show more</summary>

<br/>

Benchmark the performance of structured output generation (JSON, grammar, regex).

### Server Setup

```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```

### JSON Schema Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset json \
  --structured-output-ratio 1.0 \
  --request-rate 10 \
  --num-prompts 1000
```

### Grammar-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset grammar \
  --structure-type grammar \
  --request-rate 10 \
  --num-prompts 1000
```

### Regex-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset regex \
  --request-rate 10 \
  --num-prompts 1000
```

### Choice-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset choice \
  --request-rate 10 \
  --num-prompts 1000
```

### XGrammar Benchmark Dataset

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset xgrammar_bench \
  --request-rate 10 \
  --num-prompts 1000
```

</details>

## 📚 Example - Long Document QA Benchmark

<details>
<summary>Show more</summary>

<br/>

Benchmark the performance of long document question-answering with prefix caching.

### Basic Long Document QA Test

```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --enable-prefix-caching \
  --num-documents 16 \
  --document-length 2000 \
  --output-len 50 \
  --repeat-count 5
```

### Different Repeat Modes

```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --enable-prefix-caching \
  --num-documents 8 \
  --document-length 3000 \
  --repeat-count 3 \
  --repeat-mode random

# Tile mode - repeat the entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --enable-prefix-caching \
  --num-documents 8 \
  --document-length 3000 \
  --repeat-count 3 \
  --repeat-mode tile

# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --enable-prefix-caching \
  --num-documents 8 \
  --document-length 3000 \
  --repeat-count 3 \
  --repeat-mode interleave
```

</details>

## 🗂️ Example - Prefix Caching Benchmark

<details>
<summary>Show more</summary>

<br/>

Benchmark the efficiency of automatic prefix caching.

### Fixed Prompt with Prefix Caching

```bash
python3 benchmarks/benchmark_prefix_caching.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --enable-prefix-caching \
  --num-prompts 1 \
  --repeat-count 100 \
  --input-length-range 128:256
```

### ShareGPT Dataset with Prefix Caching

```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

python3 benchmarks/benchmark_prefix_caching.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
  --enable-prefix-caching \
  --num-prompts 20 \
  --repeat-count 5 \
  --input-length-range 128:256
```

### Prefix Repetition Dataset

```bash
vllm bench serve \
  --backend openai \
  --model meta-llama/Llama-2-7b-chat-hf \
  --dataset-name prefix_repetition \
  --num-prompts 100 \
  --prefix-repetition-prefix-len 512 \
  --prefix-repetition-suffix-len 128 \
  --prefix-repetition-num-prefixes 5 \
  --prefix-repetition-output-len 128
```

</details>

## ⚡ Example - Request Prioritization Benchmark

<details>
<summary>Show more</summary>

<br/>

Benchmark the performance of request prioritization in vLLM.

### Basic Prioritization Test

```bash
python3 benchmarks/benchmark_prioritization.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --input-len 128 \
  --output-len 64 \
  --num-prompts 100 \
  --scheduling-policy priority
```

### Multiple Sequences per Prompt

```bash
python3 benchmarks/benchmark_prioritization.py \
  --model meta-llama/Llama-2-7b-chat-hf \
  --input-len 128 \
  --output-len 64 \
  --num-prompts 100 \
  --scheduling-policy priority \
  --n 2
```

</details>

## 👁️ Example - Multi-Modal Benchmark

<details>
<summary>Show more</summary>

<br/>

Benchmark the performance of multi-modal requests in vLLM.

### Images (ShareGPT4V)

Start vLLM:

```bash
python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dtype bfloat16 \
  --limit-mm-per-prompt '{"image": 1}' \
  --allowed-local-media-path /path/to/sharegpt4v/images
```

Send requests with images:

```bash
vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
  --num-prompts 100 \
  --save-result \
  --result-dir ~/vllm_benchmark_results \
  --save-detailed \
  --endpoint /v1/chat/completions
```

### Videos (ShareGPT4Video)

Start vLLM:

```bash
python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dtype bfloat16 \
  --limit-mm-per-prompt '{"video": 1}' \
  --allowed-local-media-path /path/to/sharegpt4video/videos
```

Send requests with videos:

```bash
vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
  --num-prompts 100 \
  --save-result \
  --result-dir ~/vllm_benchmark_results \
  --save-detailed \
  --endpoint /v1/chat/completions
```

### Synthetic Random Images (random-mm)

Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.

Notes:

- Works only with online benchmarking via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.

Start the server (example):

```bash
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
  --dtype bfloat16 \
  --max-model-len 16384 \
  --limit-mm-per-prompt '{"image": 3, "video": 0}' \
  --mm-processor-kwargs max_pixels=1003520
```

Run the benchmark. It is recommended to pass `--ignore-eos` to simulate realistic response lengths; the output size can be set via `--random-output-len`.

Example 1: a fixed number of items and a single image resolution, enforcing generation of approximately 40 tokens:

```bash
vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-3B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name random-mm \
  --num-prompts 100 \
  --max-concurrency 10 \
  --random-prefix-len 25 \
  --random-input-len 300 \
  --random-output-len 40 \
  --random-range-ratio 0.2 \
  --random-mm-base-items-per-request 2 \
  --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
  --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
  --request-rate inf \
  --ignore-eos \
  --seed 42
```

The number of items per request can be varied by passing multiple image buckets:

```bash
  --random-mm-base-items-per-request 2 \
  --random-mm-num-mm-items-range-ratio 0.5 \
  --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
  --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
```

Flags specific to `random-mm`:

- `--random-mm-base-items-per-request`: base number of multimodal items per request.
- `--random-mm-num-mm-items-range-ratio`: vary the item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]; r=0 keeps the count fixed, while r=1 allows 0 items (see the sketch after this list).
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. `'{"image": 3, "video": 0}'`.
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; the remaining probabilities are renormalized to sum to 1. Use T=1 for images; any T>1 denotes video (video sampling is not yet supported).
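
For illustration, here is a minimal Python sketch of the item-count range described above (hypothetical helper name; not the benchmark's actual code):

```python
import math

def mm_item_count_range(base_items: int, range_ratio: float) -> tuple[int, int]:
    """Closed integer range [floor(n*(1-r)), ceil(n*(1+r))] for the
    number of multimodal items per request."""
    lo = math.floor(base_items * (1 - range_ratio))
    hi = math.ceil(base_items * (1 + range_ratio))
    return lo, hi

# r=0 keeps the count fixed; r=1 allows requests with 0 items.
assert mm_item_count_range(2, 0.0) == (2, 2)
assert mm_item_count_range(2, 0.5) == (1, 3)
assert mm_item_count_range(2, 1.0) == (0, 4)
```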

Behavioral notes:

- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.

How sampling works (a short sketch follows this list):

- Determine the per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of the per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing. This is an edge case; it can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number, though that may then trigger errors from the engine's own `--limit-mm-per-prompt` config.
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and the multimodal content is attached via `multi_modal_data`.
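
The following is a minimal sketch of that sampling loop, with hypothetical names (the real logic lives in the benchmark's dataset code):

```python
import math
import random

def sample_mm_items(base_items, range_ratio, bucket_config, limits):
    """Sample (H, W, T) buckets for one request under per-modality caps.

    bucket_config: {(H, W, T): probability}; T == 1 means image, T > 1 video.
    limits: per-modality caps, e.g. {"image": 3, "video": 0}.
    """
    lo = math.floor(base_items * (1 - range_ratio))
    hi = math.ceil(base_items * (1 + range_ratio))
    # Clamp k to the total number of items the limits allow.
    k = min(random.randint(lo, hi), sum(limits.values()))

    def modality(bucket):
        return "image" if bucket[2] == 1 else "video"

    counts = dict.fromkeys(limits, 0)
    chosen = []
    for _ in range(k):
        # Drop buckets whose modality hit its cap, then renormalize.
        alive = {
            b: p
            for b, p in bucket_config.items()
            if p > 0 and counts[modality(b)] < limits[modality(b)]
        }
        if not alive:
            break
        total = sum(alive.values())
        bucket = random.choices(list(alive), [p / total for p in alive.values()])[0]
        counts[modality(bucket)] += 1
        chosen.append(bucket)
    return chosen
```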

</details>

@@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:

4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.

5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.

## Batched `auto_tune`

The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.

### Prerequisites

- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.

### How to Run

1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.

2. **Execute the script**:

    ```bash
    bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
    ```

    - `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
    - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).

### Configuration File

The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
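
As a rough illustration of that conversion (the script itself does it with `jq` and `tr`), a key such as `max_model_len` becomes the environment variable `MAX_MODEL_LEN`:

```python
import re

def to_env_var_name(key: str) -> str:
    # Uppercase the key, then keep only A-Z, 0-9 and underscore,
    # mirroring `tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_'`.
    return re.sub(r"[^A-Z0-9_]", "", key.upper())

assert to_env_var_name("max_model_len") == "MAX_MODEL_LEN"
assert to_env_var_name("num_batched_tokens_list") == "NUM_BATCHED_TOKENS_LIST"
```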

Here is an example `runs_config.json` with two benchmark configurations (the `system` field accepts `TPU` or `GPU`; JSON does not allow comments):

```json
[
  {
    "base": "/home/user",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "system": "TPU",
    "tp": 8,
    "input_len": 128,
    "output_len": 2048,
    "max_model_len": 2300,
    "num_seqs_list": "128 256",
    "num_batched_tokens_list": "8192 16384"
  },
  {
    "base": "/home/user",
    "model": "meta-llama/Llama-3.1-70B-Instruct",
    "system": "TPU",
    "tp": 8,
    "input_len": 4000,
    "output_len": 16,
    "max_model_len": 4096,
    "num_seqs_list": "64 128",
    "num_batched_tokens_list": "4096 8192",
    "max_latency_allowed_ms": 500
  }
]
```

### Output

The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:

- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).

A summary of successful and failed runs is also printed to the console upon completion.
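
For intuition, the in-place update the script performs with `jq '.[$i] += {...}'` is roughly equivalent to this Python sketch (an illustration, not the script's actual code):

```python
import json

def record_run_result(path, index, run_id, status, results, gcs_results=""):
    # Merge the run's outcome into the matching config object,
    # then write the file back in place (mirroring the jq update).
    with open(path) as f:
        runs = json.load(f)
    runs[index].update(
        run_id=run_id, status=status, results=results, gcs_results=gcs_results
    )
    with open(path, "w") as f:
        json.dump(runs, f, indent=2)
```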

@@ -74,7 +74,7 @@ start_server() {
     local vllm_log=$4
     local profile_dir=$5

-    pkill -if vllm
+    pkill -if "vllm serve" || true

     # Define the common arguments as a bash array.
     # Each argument and its value are separate elements.
@@ -96,17 +96,22 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     fi
+    local server_pid=$!

     # wait for 10 minutes...
     server_started=0
     for i in {1..60}; do
+        # This line checks whether the server is still alive or not,
+        # since we should always have permission to send a signal to the server process.
+        kill -0 $server_pid 2> /dev/null || break
+
         RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
         STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
         if [[ "$STATUS_CODE" -eq 200 ]]; then
@@ -118,7 +123,7 @@ start_server() {
     done

     if (( ! server_started )); then
-        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
+        echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
         return 1
     else
         return 0
@@ -134,7 +139,7 @@ run_benchmark() {
     echo "vllm_log: $vllm_log"
     echo
     rm -f $vllm_log
-    pkill -if vllm
+    pkill -if "vllm serve" || true

     echo "starting server..."
     # Call start_server without a profile_dir to avoid profiling overhead
@@ -227,7 +232,7 @@ run_benchmark() {

     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

-    pkill -if vllm
+    pkill -if "vllm serve" || true
     sleep 10
     echo "===================="
     return 0
@@ -303,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 else
     echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
-pkill -if vllm
+pkill -if "vllm serve" || true
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

benchmarks/auto_tune/batch_auto_tune.sh (new executable file, 128 lines)
@@ -0,0 +1,128 @@
#!/bin/bash

INPUT_JSON="$1"
GCS_PATH="$2" # Optional GCS path for uploading results for each run

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"

if [[ -z "$INPUT_JSON" ]]; then
    echo "Error: Input JSON file not provided."
    echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
    exit 1
fi

if [[ ! -f "$INPUT_JSON" ]]; then
    echo "Error: File not found at '$INPUT_JSON'"
    exit 1
fi

if ! command -v jq &> /dev/null; then
    echo "Error: 'jq' command not found. Please install jq to process the JSON input."
    exit 1
fi

if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
    echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
    exit 1
fi

SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILED_RUNS=()
SCRIPT_START_TIME=$(date +%s)

json_content=$(cat "$INPUT_JSON")
if ! num_runs=$(echo "$json_content" | jq 'length'); then
    echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
    exit 1
fi

echo "Found $num_runs benchmark configurations in $INPUT_JSON."
echo "Starting benchmark runs..."
echo "--------------------------------------------------"

for i in $(seq 0 $(($num_runs - 1))); do
    run_object=$(echo "$json_content" | jq ".[$i]")

    RUN_START_TIME=$(date +%s)
    ENV_VARS_ARRAY=()
    # Dynamically create env vars from the JSON object's keys
    for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
        value=$(echo "$run_object" | jq -r ".$key")
        var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
        ENV_VARS_ARRAY+=("${var_name}=${value}")
    done

    echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"

    # Execute auto_tune.sh and capture output
    RUN_OUTPUT_FILE=$(mktemp)
    if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
        STATUS="SUCCESS"
        ((SUCCESS_COUNT++))
    else
        STATUS="FAILURE"
        ((FAILURE_COUNT++))
        FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
    fi

    RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
    rm "$RUN_OUTPUT_FILE"

    # Parse results and optionally upload them to GCS
    RUN_ID=""
    RESULTS=""
    GCS_RESULTS_URL=""
    if [[ "$STATUS" == "SUCCESS" ]]; then
        RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)

        if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
            RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
            RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
            RESULTS=$(cat "$RESULT_FILE_PATH")

            if [[ -n "$GCS_PATH" ]]; then
                GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
                echo "Uploading results to GCS..."
                if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
                    echo "GCS upload successful."
                else
                    echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
                fi
            fi
        else
            echo "Warning: Could not find result file for a successful run."
            STATUS="WARNING_NO_RESULT_FILE"
        fi
    fi

    # Add the results back into the JSON object for this run
    json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
        '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')

    RUN_END_TIME=$(date +%s)
    echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
    echo "--------------------------------------------------"

    # Save intermediate progress back to the file
    echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"

done

SCRIPT_END_TIME=$(date +%s)
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
echo
echo "====================== SUMMARY ======================"
echo "Successful runs: $SUCCESS_COUNT"
echo "Failed runs: $FAILURE_COUNT"
echo "==================================================="

if [[ $FAILURE_COUNT -gt 0 ]]; then
    echo "Details of failed runs (see JSON file for full parameters):"
    for failed in "${FAILED_RUNS[@]}"; do
        echo "  - $failed"
    done
fi

echo "Updated results have been saved to '$INPUT_JSON'."

@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union

 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None


 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:

 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(

 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(

 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(

 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(

 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(

 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):

@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc

+from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from benchmark_utils import TimeCollector
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool

(File diff suppressed because it is too large.)

@@ -1,17 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
+import time
+from unittest import mock

 import numpy as np
+from benchmark_utils import TimeCollector
 from tabulate import tabulate

-from benchmark_utils import TimeCollector
-from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    LoadConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
+from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.worker.gpu_input_batch import InputBatch
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner


-def main(args):
+def benchmark_propose(args):
     rows = []
     for max_ngram in args.max_ngram:
         collector = TimeCollector(TimeCollector.US)
@@ -69,10 +83,88 @@ def main(args):
     )


+def benchmark_batched_propose(args):
+    NUM_SPECULATIVE_TOKENS_NGRAM = 10
+    PROMPT_LOOKUP_MIN = 5
+    PROMPT_LOOKUP_MAX = 15
+    MAX_MODEL_LEN = int(1e7)
+    DEVICE = current_platform.device_type
+
+    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
+
+    speculative_config = SpeculativeConfig(
+        target_model_config=model_config,
+        target_parallel_config=ParallelConfig(),
+        method="ngram",
+        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
+        prompt_lookup_max=PROMPT_LOOKUP_MAX,
+        prompt_lookup_min=PROMPT_LOOKUP_MIN,
+    )
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        cache_config=CacheConfig(),
+        speculative_config=speculative_config,
+        device_config=DeviceConfig(device=current_platform.device_type),
+        parallel_config=ParallelConfig(),
+        load_config=LoadConfig(),
+        scheduler_config=SchedulerConfig(),
+    )
+
+    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
+    mock_pp_group = mock.MagicMock()
+    mock_pp_group.world_size = 1
+    with mock.patch(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
+    ):
+        runner = GPUModelRunner(vllm_config, DEVICE)
+
+    # hack max model len
+    runner.max_model_len = MAX_MODEL_LEN
+    runner.drafter.max_model_len = MAX_MODEL_LEN
+
+    dummy_input_batch = InputBatch(
+        max_num_reqs=args.num_req,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_batched_tokens=args.num_req * args.num_token,
+        device=DEVICE,
+        pin_memory=False,
+        vocab_size=256000,
+        block_sizes=[16],
+    )
+    dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
+    dummy_input_batch.spec_decode_unsupported_reqs = ()
+    dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
+    dummy_input_batch.token_ids_cpu = np.random.randint(
+        0, 20, (args.num_req, args.num_token)
+    )
+
+    runner.input_batch = dummy_input_batch
+
+    sampled_token_ids = [[0]] * args.num_req
+
+    print("Starting benchmark")
+    # first run is warmup so ignore it
+    for _ in range(args.num_iteration):
+        start = time.time()
+        runner.drafter.propose(
+            sampled_token_ids,
+            dummy_input_batch.req_ids,
+            dummy_input_batch.num_tokens_no_spec,
+            dummy_input_batch.token_ids_cpu,
+            dummy_input_batch.spec_decode_unsupported_reqs,
+        )
+        end = time.time()
+        print(f"Iteration time (s): {end - start}")
+
+
 def invoke_main() -> None:
     parser = FlexibleArgumentParser(
         description="Benchmark the performance of N-gram speculative decode drafting"
     )
+    parser.add_argument(
+        "--batched", action="store_true", help="consider time to prepare batch"
+    )
     parser.add_argument(
         "--num-iteration",
         type=int,
@@ -105,8 +197,17 @@ def invoke_main() -> None:
         help="Number of speculative tokens to generate",
     )
     args = parser.parse_args()
-    main(args)
+
+    if not args.batched:
+        benchmark_propose(args)
+    else:
+        benchmark_batched_propose(args)


+"""
+# Example command lines:
+# time python3 benchmarks/benchmark_ngram_proposer.py
+# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
+"""  # noqa: E501
 if __name__ == "__main__":
     invoke_main()  # pragma: no cover

@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import PreTrainedTokenizerBase

@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
     prefix_len: int,
 ) -> list[Request]:
     requests = []

@@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -24,7 +23,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")

@@ -31,20 +31,19 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Optional

 import datasets
 import numpy as np
 import pandas as pd
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase

 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
     RequestFuncInput,
     RequestFuncOutput,
 )
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase

 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -317,7 +316,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -437,9 +436,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
     structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -449,7 +448,8 @@ async def benchmark(
     def prepare_extra_body(request) -> dict:
         extra_body = {}
         # Add the schema to the extra_body
-        extra_body[request.structure_type] = request.schema
+        extra_body["structured_outputs"] = {}
+        extra_body["structured_outputs"][request.structure_type] = request.schema
         return extra_body

     print("Starting initial single prompt test run...")
@@ -502,15 +502,9 @@ async def benchmark(

     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    # semaphore = (asyncio.Semaphore(max_concurrency)
-    # if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

     async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)

@@ -696,11 +690,11 @@ def evaluate(ret, args):
         return re.match(args.regex, actual) is not None

     def _eval_correctness(expected, actual):
-        if args.structure_type == "guided_json":
+        if args.structure_type == "json":
             return _eval_correctness_json(expected, actual)
-        elif args.structure_type == "guided_regex":
+        elif args.structure_type == "regex":
             return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == "guided_choice":
+        elif args.structure_type == "choice":
             return _eval_correctness_choice(expected, actual)
         else:
             return None
@@ -780,18 +774,18 @@ def main(args: argparse.Namespace):
     )

     if args.dataset == "grammar":
-        args.structure_type = "guided_grammar"
+        args.structure_type = "grammar"
     elif args.dataset == "regex":
-        args.structure_type = "guided_regex"
+        args.structure_type = "regex"
     elif args.dataset == "choice":
-        args.structure_type = "guided_choice"
+        args.structure_type = "choice"
     else:
-        args.structure_type = "guided_json"
+        args.structure_type = "json"

     if args.no_structured_output:
         args.structured_output_ratio = 0
     if args.save_results:
-        result_file_name = f"{args.structured_output_ratio}guided"
+        result_file_name = f"{args.structured_output_ratio}so"
         result_file_name += f"_{backend}"
         result_file_name += f"_{args.request_rate}qps"
         result_file_name += f"_{args.model.split('/')[-1]}"
@@ -909,13 +903,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--num-prompts",

@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any


 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()

@@ -104,13 +104,13 @@ class TimeCollector:
         else:
             self._max = max(self._max, v)

-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
         return self._max / self.scale if self._max else "N/A"

-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
         return [self.avg(), self.max()]

     def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:

     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)

@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark

@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -17,7 +16,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv

@@ -53,7 +52,7 @@ def bench_int8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -158,7 +157,7 @@ def bench_fp8(
         "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
             a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
         ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
             a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
         ),
         "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -183,7 +182,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
@@ -55,9 +55,7 @@ benchmark() {
|
|||||||
output_len=$2
|
output_len=$2
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python3 \
|
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
||||||
-m vllm.entrypoints.openai.api_server \
|
|
||||||
--model $model \
|
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
@@ -65,9 +63,7 @@ benchmark() {
|
|||||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 python3 \
|
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
||||||
-m vllm.entrypoints.openai.api_server \
|
|
||||||
--model $model \
|
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
|
|||||||
@@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --enable-chunked-prefill \
         --gpu-memory-utilization 0.6 &
-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --enable-chunked-prefill \
@@ -62,18 +58,14 @@ launch_disagg_prefill() {
 launch_disagg_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
         --kv-transfer-config \
         '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
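These two script diffs make the same change: the servers are now started with the `vllm serve` CLI instead of `python3 -m vllm.entrypoints.openai.api_server --model ...`. As a rough Python-side sketch of what the updated producer launch does, with the model, port, and connector JSON taken from the script above; this is an illustration, not part of the patch:

import os
import subprocess

model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
kv_cfg = (
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer",'
    '"kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}'
)
# Equivalent to: CUDA_VISIBLE_DEVICES=0 vllm serve $model --port 8100 ... &
producer = subprocess.Popen(
    ["vllm", "serve", model,
     "--port", "8100",
     "--max-model-len", "10000",
     "--gpu-memory-utilization", "0.6",
     "--kv-transfer-config", kv_cfg],
    env={**os.environ, "CUDA_VISIBLE_DEVICES": "0"},
)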
@@ -3,10 +3,9 @@

 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
benchmarks/kernels/bench_mxfp4_qutlass.py (new file, 191 lines)
@@ -0,0 +1,191 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import copy
import itertools

import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES

from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton

PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "mxfp4": dict(no_a_quant=False, enabled=True),
    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
}

_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
    return (
        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
        * group_size**-0.5
    )


def _quant_weight_mxfp4(
    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
):
    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
        b, forward_hadamard_matrix, method="abs_max"
    )
    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
    return weight_hf_e2m1, weight_hf_scale_block


def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
        b, forward_hadamard_matrix, device
    )
    alpha = torch.tensor([1.0], device="cuda")

    if cfg["no_a_quant"]:
        # Pre-quantize activation
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
            a, forward_hadamard_matrix, method="abs_max"
        )
        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")

        def run():
            return matmul_mxf4_bf16_tn(
                input_hf_e2m1,
                weight_hf_e2m1,
                input_hf_scale_block,
                weight_hf_scale_block,
                alpha,
            )

        return run

    # Quantize activation on-the-fly
    def run():
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
            a, forward_hadamard_matrix, method="abs_max"
        )
        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
        return matmul_mxf4_bf16_tn(
            input_hf_e2m1,
            weight_hf_e2m1,
            input_hf_scale_block,
            weight_hf_scale_block,
            alpha,
        )

    return run


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192,
                16384, 24576, 32768],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs MXFP4 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K, had_size):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)
    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)

    quantiles = [0.5, 0.2, 0.8]

    if provider == "torch-bf16":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
        )
    else:
        cfg = PROVIDER_CFGS[provider]
        run_quant = build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device)
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), rep=200, quantiles=quantiles
        )

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


def prepare_shapes(args):
    out = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_dim] //= tp_size
            KN.append(model)
            out.append(KN)
    return out


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()

    for K, N, model in prepare_shapes(args):
        for had_size in [32, 64, 128]:
            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
            benchmark.run(
                print_data=True,
                show_plots=True,
                save_path=f"bench_mxfp4_res_n{N}_k{K}",
                N=N,
                K=K,
                had_size=had_size,
            )

    print("Benchmark finished!")
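A worked example of the TFLOP/s conversion this benchmark uses: a GEMM performs 2*M*N*K floating-point operations, so at M = N = K = 4096 a 1 ms run corresponds to 2 * 4096**3 = 1.374e11 FLOP, or about 137.4 TFLOP/s. The snippet mirrors the `to_tflops` lambda above:

M = N = K = 4096
t_ms = 1.0  # measured kernel time in milliseconds
tflops = (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
print(f"{tflops:.1f} TFLOP/s")  # 137.4 TFLOP/s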
@@ -3,6 +3,7 @@
 import argparse
 import copy
 import itertools
+import os

 import torch
 from weight_shapes import WEIGHT_SHAPES
@@ -23,21 +24,45 @@ PROVIDER_CFGS = {
     "torch-bf16": dict(enabled=True),
     "nvfp4": dict(no_a_quant=False, enabled=True),
     "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+    "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
+    "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
 }

+_needs_fbgemm = any(
+    v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
+)
+if _needs_fbgemm:
+    try:
+        from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
+            triton_scale_nvfp4_quant,
+        )
+    except ImportError:
+        print(
+            "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
+            "These providers will be skipped. Please install fbgemm_gpu with: "
+            "'pip install fbgemm-gpu-genai' to run them."
+        )
+        # Disable FBGEMM providers so the benchmark can run.
+        for cfg in PROVIDER_CFGS.values():
+            if cfg.get("fbgemm"):
+                cfg["enabled"] = False
+
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


-def _quant_weight_nvfp4(b: torch.Tensor, device: str):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
     # Compute global scale for weight
     b_amax = torch.abs(b).max().to(torch.float32)
     b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
+    else:
+        b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
     return b_fp4, scale_b_fp4, b_global_scale


 def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
+    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)

     # Compute global scale for activation
     # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):

     # Alpha for the GEMM operation
     alpha = 1.0 / (a_global_scale * b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        if cfg["no_a_quant"]:
+            a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+
+            def run():
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run
+        else:
+
+            def run():
+                a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run
+
     if cfg["no_a_quant"]:
         # Pre-quantize activation
@@ -130,10 +184,13 @@ if __name__ == "__main__":

     for K, N, model in prepare_shapes(args):
         print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
+        os.makedirs(save_dir, exist_ok=True)
+
         benchmark.run(
             print_data=True,
             show_plots=True,
-            save_path=f"bench_nvfp4_res_n{N}_k{K}",
+            save_path=save_dir,
             N=N,
             K=K,
         )
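The guarded import above follows a common optional-dependency pattern: probe for the package once at module import, and disable the providers that need it rather than crash. A minimal standalone sketch of the same pattern, with illustrative names:

# Enable an optional provider only when its dependency imports cleanly.
providers = {"baseline": True, "fbgemm": True}
try:
    import fbgemm_gpu  # noqa: F401
except ImportError:
    providers["fbgemm"] = False

print([name for name, enabled in providers.items() if enabled])

Disabling beats raising here because the benchmark can still produce the non-FBGEMM curves on machines without the extra package.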
benchmarks/kernels/bench_nvfp4_qutlass.py (new file, 207 lines)
@@ -0,0 +1,207 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import copy
import itertools

import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton

PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "nvfp4": dict(no_a_quant=False, enabled=True),
    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}

_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
    return (
        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
        * group_size**-0.5
    )


def _quant_weight_nvfp4(
    b: torch.Tensor,
    forward_hadamard_matrix: torch.Tensor,
    global_scale: torch.Tensor,
    device: str,
    M: int,
    N: int,
    K: int,
):
    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
        b, forward_hadamard_matrix, global_scale
    )
    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
        -1, K // 16
    )
    return weight_hf_e2m1, weight_hf_scale_block


def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
    alpha = torch.tensor([1.0], device="cuda")
    global_scale = torch.tensor([1.0], device="cuda")
    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
        b, forward_hadamard_matrix, global_scale, device, M, N, K
    )

    if cfg["no_a_quant"]:
        # Pre-quantize activation
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
            a, forward_hadamard_matrix, global_scale
        )
        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
            -1, K // 16
        )

        def run():
            return ops.cutlass_scaled_fp4_mm(
                input_hf_e2m1,
                weight_hf_e2m1,
                input_hf_scale_block,
                weight_hf_scale_block,
                alpha,
                torch.bfloat16,
            )

        return run

    # Quantize activation on-the-fly
    def run():
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
            a, forward_hadamard_matrix, global_scale
        )
        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
            -1, K // 16
        )
        return ops.cutlass_scaled_fp4_mm(
            input_hf_e2m1,
            weight_hf_e2m1,
            input_hf_scale_block,
            weight_hf_scale_block,
            alpha,
            torch.bfloat16,
        )

    return run


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192,
                16384, 24576, 32768],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs NVFP4 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K, had_size):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)
    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)

    quantiles = [0.5, 0.2, 0.8]

    if provider == "torch-bf16":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
        )
    else:
        cfg = PROVIDER_CFGS[provider]
        run_quant = build_nvfp4_runner(
            cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
        )
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), rep=200, quantiles=quantiles
        )

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


def prepare_shapes(args):
    out = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_dim] //= tp_size
            KN.append(model)
            out.append(KN)
    return out


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()

    for K, N, model in prepare_shapes(args):
        for had_size in [16, 32, 64, 128]:
            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
            benchmark.run(
                print_data=True,
                show_plots=True,
                save_path=f"bench_nvfp4_res_n{N}_k{K}",
                N=N,
                K=K,
                had_size=had_size,
            )

    print("Benchmark finished!")
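Both qutlass benchmarks scale the Hadamard matrix by `group_size**-0.5`. The reason: a +/-1 Hadamard matrix H of size n satisfies H @ H.T = n * I, so H / sqrt(n) is orthonormal and the rotation preserves vector norms before quantization. A self-contained check, using a Sylvester construction as a stand-in for `deterministic_hadamard_matrix`:

import torch


def hadamard(n: int) -> torch.Tensor:
    # Sylvester construction; n must be a power of two.
    h = torch.ones(1, 1)
    while h.shape[0] < n:
        h = torch.cat([torch.cat([h, h], dim=1), torch.cat([h, -h], dim=1)], dim=0)
    return h


n = 32
h = hadamard(n) * n**-0.5
assert torch.allclose(h @ h.T, torch.eye(n), atol=1e-6)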
@@ -1,15 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
+from unittest.mock import patch

+import pandas as pd
 import torch

-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
+
+def with_triton_mode(fn):
+    """Temporarily force the Triton fallback path"""
+
+    def wrapped(*args, **kwargs):
+        with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+            return fn(*args, **kwargs)
+
+    return wrapped


 # TODO(luka): use standalone_compile utility
@@ -21,78 +33,238 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
     return inner


-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
-    torch_per_token_quant_fp8 = torch.compile(
-        QuantFP8(False, GroupShape.PER_TOKEN),
-        fullgraph=True,
-        dynamic=False,  # recompile for different shapes
-    )
+def bench_compile(fn: Callable):
+    # recompile for different shapes
+    fwd = torch.compile(fn, fullgraph=True, dynamic=False)

     # First dim is explicitly dynamic to simulate vLLM usage
-    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+    return with_dyn_arg(fwd, 0, 0)


-def cuda_per_token_quant_fp8(
-    input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    return ops.scaled_fp8_quant(input)
+torch._dynamo.config.recompile_limit = 8888


-def calculate_diff(batch_size: int, seq_len: int):
-    """Calculate difference between Triton and CUDA implementations."""
+def calculate_diff(
+    batch_size: int,
+    hidden_size: int,
+    group_shape: GroupShape,
+    dtype: torch.dtype,
+):
+    """Calculate the difference between Inductor and CUDA implementations."""
     device = torch.device("cuda")
-    x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)

-    torch_out, torch_scale = torch_per_token_quant_fp8(x)
-    cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)

-    if torch.allclose(
-        cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
-    ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
+    torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
+    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
+    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
+
+    try:
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_eager_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
         print("✅ All implementations match")
-    else:
+    except AssertionError as e:
         print("❌ Implementations differ")
+        print(e)


-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-
-configs = list(itertools.product(batch_size_range, seq_len_range))
+configs = []


-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size", "seq_len"],
-        x_vals=configs,
-        line_arg="provider",
-        line_vals=["torch", "cuda"],
-        line_names=["Torch", "CUDA"],
-        styles=[("blue", "-"), ("green", "-")],
-        ylabel="us",
-        plot_name="per-token-dynamic-quant-fp8-performance",
-        args={},
-    )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
-    dtype = torch.float16
+def benchmark_quantization(
+    batch_size,
+    hidden_size,
+    provider,
+    group_shape: GroupShape,
+    col_major: bool,
+    dtype: torch.dtype,
+):
     device = torch.device("cuda")

-    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)

     quantiles = [0.5, 0.2, 0.8]
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)

     if provider == "torch":
-        fn = lambda: torch_per_token_quant_fp8(x.clone())
+        fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
     elif provider == "cuda":
-        fn = lambda: cuda_per_token_quant_fp8(x.clone())
+        fn = lambda: quant_fp8.forward_cuda(x.clone())
+    elif provider == "triton":
+        if not group_shape.is_per_group():
+            # Triton only supported for per-group
+            return 0, 0, 0
+
+        fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())

     ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)

     return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+# TODO(luka) extract to utils
+def compute_geomean_speedups(
+    df: pd.DataFrame,
+    baseline_col: str,
+    speedup_cols: list[str],
+    groupby_cols: list[str] | None = None,
+) -> pd.DataFrame:
+    """
+    Compute geometric mean speedups over a baseline column.
+
+    Args:
+        df: Input dataframe
+        baseline_col: Column to use as baseline
+        speedup_cols: Columns to compute speedups for
+        groupby_cols: Columns to group by. If None, compute over entire df.
+
+    Returns:
+        pd.DataFrame with geometric mean speedups
+    """
+    from scipy.stats import gmean
+
+    def geo_speedup(group: pd.DataFrame) -> pd.Series:
+        ratios = {
+            col: (group[baseline_col] / group[col]).values for col in speedup_cols
+        }
+        return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
+
+    if groupby_cols is None:
+        result = geo_speedup(df).to_frame().T
+    else:
+        result = (
+            df.groupby(groupby_cols)
+            .apply(geo_speedup, include_groups=False)
+            .reset_index()
+        )
+
+    return result


 if __name__ == "__main__":
-    calculate_diff(batch_size=4, seq_len=4096)
-    benchmark_quantization.run(print_data=True)
+    parser = FlexibleArgumentParser(
+        description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
+    )
+    parser.add_argument("-c", "--check", action="store_true")
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+    )
+    parser.add_argument(
+        "--hidden-sizes",
+        type=int,
+        nargs="+",
+        default=[896, 1024, 2048, 4096, 7168],
+        help="Hidden sizes to benchmark",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=[1, 16, 128, 512, 1024],
+        help="Batch sizes to benchmark",
+    )
+    parser.add_argument(
+        "--group-sizes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Group sizes for GroupShape(1,N) to benchmark. "
+        "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
+    )
+    parser.add_argument(
+        "--no-column-major",
+        action="store_true",
+        help="Disable column-major scales testing",
+    )
+
+    args = parser.parse_args()
+    assert args
+
+    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+    hidden_sizes = args.hidden_sizes
+    batch_sizes = args.batch_sizes
+
+    if args.group_sizes is not None:
+        group_shapes = []
+        for size in args.group_sizes:
+            if size == 0:
+                group_shapes.append(GroupShape.PER_TENSOR)
+            elif size == -1:
+                group_shapes.append(GroupShape.PER_TOKEN)
+            else:
+                group_shapes.append(GroupShape(1, size))
+    else:
+        group_shapes = [
+            GroupShape.PER_TENSOR,
+            GroupShape.PER_TOKEN,
+            GroupShape(1, 64),
+            GroupShape(1, 128),
+        ]
+
+    column_major_scales = [False] if args.no_column_major else [True, False]
+
+    config_gen = itertools.product(
+        group_shapes,
+        column_major_scales,
+        batch_sizes,
+        hidden_sizes,
+    )
+
+    # filter out column-major scales for non-group, reverse order
+    configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
+
+    print(f"Running {len(configs)} configurations:")
+    print(f"  Hidden sizes: {hidden_sizes}")
+    print(f"  Batch sizes: {batch_sizes}")
+    print(f"  Group shapes: {[str(g) for g in group_shapes]}")
+    print(f"  Column major scales: {column_major_scales}")
+    print()
+
+    if args.check:
+        for group_shape in group_shapes:
+            group_size = group_shape[1]
+            print(f"{group_size=}")
+            calculate_diff(
+                batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
+            )
+
+    benchmark = triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["torch", "cuda", "triton"],
+            line_names=["Torch (Compiled)", "CUDA", "Triton"],
+            styles=[("blue", "-"), ("green", "-"), ("black", "-")],
+            ylabel="us",
+            plot_name="QuantFP8 performance",
+            args={},
+        )
+    )(benchmark_quantization)
+
+    df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
+
+    # Print geomean speedups
+    geo_table_grouped = compute_geomean_speedups(
+        df,
+        baseline_col="Torch (Compiled)",
+        speedup_cols=["CUDA", "Triton"],
+        groupby_cols=["col_major", "group_shape"],
+    )
+
+    print("Speedup over Torch (Compiled)")
+    print(geo_table_grouped.to_string(index=False))
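A minimal sketch of exercising QuantFP8 the way the reworked benchmark does, assuming a CUDA-capable vLLM install; `forward_native` is the eager PyTorch path and `forward_cuda` the custom-op path, and each returns a (quantized tensor, scales) pair, exactly as used in `calculate_diff` above:

import torch

from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape

x = torch.randn(16, 4096, dtype=torch.bfloat16, device="cuda")
quant_fp8 = QuantFP8(False, GroupShape.PER_TOKEN, column_major_scales=False)

out_native, scale_native = quant_fp8.forward_native(x)
out_cuda, scale_cuda = quant_fp8.forward_cuda(x)
torch.testing.assert_close(scale_cuda, scale_native, rtol=1e-3, atol=1e-5)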
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
@@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+    fp8_w8a8_moe_quant_config,
+    nvfp4_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
@@ -140,6 +144,12 @@ def bench_run(
         a_fp8_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_fp8_scale,
+        )
+
         for _ in range(num_repeats):
             fused_experts(
                 a,
@@ -147,10 +157,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
             )

     def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def bench_run(
         device: torch.device,
         num_repeats: int,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
         for _ in range(num_repeats):
             with nvtx.annotate("cutlass_moe_fp4", color="green"):
                 cutlass_moe_fp4(
                     a=a,
-                    a1_gscale=a1_gs,
-                    a2_gscale=a2_gs,
                     w1_fp4=w1_fp4,
-                    w1_blockscale=w1_blockscale,
-                    w1_alphas=w1_gs,
                     w2_fp4=w2_fp4,
-                    w2_blockscale=w2_blockscale,
-                    w2_alphas=w2_gs,
                     topk_weights=topk_weights,
                     topk_ids=topk_ids,
                     m=m,
                     n=n,
                     k=k,
                     e=num_experts,
-                    device=device,
+                    quant_config=quant_config,
                 )

     def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def bench_run(
         e: int,
         device: torch.device,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
             return cutlass_moe_fp4(
                 a=a,
-                a1_gscale=a1_gs,
                 w1_fp4=w1_fp4,
-                w1_blockscale=w1_blockscale,
-                w1_alphas=w1_alphas,
-                a2_gscale=a2_gs,
                 w2_fp4=w2_fp4,
-                w2_blockscale=w2_blockscale,
-                w2_alphas=w2_alphas,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 m=m,
                 n=n,
                 k=k,
                 e=num_experts,
-                device=device,
+                quant_config=quant_config,
             )

     def run_triton_from_graph(
@@ -246,16 +258,18 @@ def bench_run(
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
+            quant_config = fp8_w8a8_moe_quant_config(
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_fp8_scale,
+            )
             return fused_experts(
                 a,
                 w1,
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
            )

     def replay_graph(graph, num_repeats):
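The shape of the API change in these hunks: quantization scales now travel in a config object built once, instead of loose `use_fp8_w8a8`/`w*_scale` kwargs on every call. A minimal sketch of the new config-first convention, with dummy per-tensor scales just to build the object:

import torch

from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config

num_experts = 8
w1_scale = torch.ones(num_experts, 1, 1, device="cuda")
w2_scale = torch.ones(num_experts, 1, 1, device="cuda")
a1_scale = torch.full((), 1e-2, device="cuda")

quant_config = fp8_w8a8_moe_quant_config(
    w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a1_scale
)
# quant_config is then passed as fused_experts(..., quant_config=quant_config)
# in place of the old use_fp8_w8a8=True plus loose scale kwargs.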
benchmarks/kernels/benchmark_cutlass_moe_fp8.py (new file, 406 lines)
@@ -0,0 +1,406 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""

import nvtx
import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser

# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
WEIGHT_SHAPES_MOE = {
    "mixtral-8x7b": [
        [8, 2, 4096, 14336],
    ],
    "deepseek-v2": [
        [160, 6, 5120, 12288],
    ],
    "custom-small": [
        [8, 2, 2048, 7168],
    ],
    "glm45-fp8": [
        [128, 8, 4096, 1408],
    ],
    "Llama-4-Maverick-17B-128E-Instruct-FP8": [
        [128, 1, 5120, 8192],
    ],
}

DEFAULT_MODELS = [
    "mixtral-8x7b",
]

DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]

PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]

FP8_DTYPE = current_platform.fp8_dtype()


def bench_run(
    results: list,
    model: str,
    num_experts: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
    mkn: tuple[int, int, int],
):
    (m, k, n) = mkn

    dtype = torch.half
    device = "cuda"

    # Create input activations
    a = torch.randn((m, k), device=device, dtype=dtype) / 10

    # Create weights
    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10

    # Create FP8 quantized weights and scales for both kernels
    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)

    # Create scales based on quantization strategy
    if per_out_ch:
        # Per-channel quantization
        w1_scale = torch.empty(
            (num_experts, 2 * n, 1), device=device, dtype=torch.float32
        )
        w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
    else:
        # Per-tensor quantization
        w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
        w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)

    # Quantize weights
    for expert in range(num_experts):
        if per_out_ch:
            # Per-channel quantization - not yet implemented properly
            # For now, fall back to per-tensor quantization
            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
            # Expand scalar scales to the expected per-channel shape
            w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
            w2_scale[expert] = w2_scale_temp.expand(k, 1)
        else:
            # Per-tensor quantization
            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
            # Store scalar scales in [1, 1] tensors
            w1_scale[expert, 0, 0] = w1_scale_temp
            w2_scale[expert, 0, 0] = w2_scale_temp

    # Prepare weights for CUTLASS (no transpose needed)
    w1_fp8q_cutlass = w1_fp8q  # Keep original [E, 2N, K]
    w2_fp8q_cutlass = w2_fp8q  # Keep original [E, K, N]

    # Create router scores and get topk
    score = torch.randn((m, num_experts), device=device, dtype=dtype)
    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)

    # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
    # Force per-tensor quantization for all cases to match working e2e setup
    a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
    a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)

    # Force per-tensor quantization for all cases
    per_act_token = False

    # Create stride tensors for CUTLASS
    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)

    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
        a2_scale: torch.Tensor,
        num_repeats: int,
    ):
        quant_config = fp8_w8a8_moe_quant_config(
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_out_ch,
        )

        for _ in range(num_repeats):
            fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )

    def run_cutlass_moe_fp8(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor,
        ab_strides2: torch.Tensor,
        c_strides1: torch.Tensor,
        c_strides2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
        a2_scale: torch.Tensor,
        num_repeats: int,
    ):
        quant_config = fp8_w8a8_moe_quant_config(
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_out_ch,
        )

        for _ in range(num_repeats):
            with nvtx.annotate("cutlass_moe_fp8", color="blue"):
                cutlass_moe_fp8(
                    a=a,
                    w1_q=w1,
                    w2_q=w2,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    ab_strides1=ab_strides1,
                    ab_strides2=ab_strides2,
                    c_strides1=c_strides1,
                    c_strides2=c_strides2,
                    quant_config=quant_config,
                    activation="silu",
                    global_num_experts=num_experts,
                )

    # Pre-create quantization config to avoid creating it inside CUDA graph
    quant_config = fp8_w8a8_moe_quant_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        per_act_token_quant=per_act_token,
        per_out_ch_quant=per_out_ch,
    )

    # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        # Capture 10 invocations like benchmark_moe.py
        for _ in range(10):
            cutlass_moe_fp8(
                a=a,
                w1_q=w1_fp8q_cutlass,
                w2_q=w2_fp8q_cutlass,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                ab_strides1=ab_strides1,
                ab_strides2=ab_strides2,
                c_strides1=c_strides1,
                c_strides2=c_strides2,
                quant_config=quant_config,
                activation="silu",
                global_num_experts=num_experts,
            )
    torch.cuda.synchronize()

    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        # Capture 10 invocations like benchmark_moe.py
        for _ in range(10):
            fused_experts(
                a,
                w1_fp8q,
                w2_fp8q,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )
    torch.cuda.synchronize()

    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
        """Benchmark CUDA graph using events like benchmark_moe.py"""
        # Warmup
        for _ in range(num_warmup):
            graph.replay()
        torch.cuda.synchronize()

        # Timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        latencies = []
        for _ in range(num_iters):
            torch.cuda.synchronize()
            start_event.record()
            graph.replay()
            end_event.record()
            end_event.synchronize()
            latencies.append(start_event.elapsed_time(end_event))

        # Divide by 10 since graph contains 10 calls
        return sum(latencies) / (num_iters * 10)

    # Benchmark parameters
    num_warmup = 5
    num_iters = 100

    # Benchmark only CUDA graphs (more reliable and faster)
    # Benchmark Triton MoE with CUDA graphs
    triton_graph_time = bench_cuda_graph(
        triton_graph, num_warmup=num_warmup, num_iters=num_iters
    )

    # Benchmark CUTLASS MoE with CUDA graphs
    cutlass_graph_time = bench_cuda_graph(
        cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
    )

    # Convert ms to us and return results
    triton_time_us = triton_graph_time * 1000
    cutlass_time_us = cutlass_graph_time * 1000

    return {
        "batch_size": m,
        "triton_time_us": triton_time_us,
        "cutlass_time_us": cutlass_time_us,
    }


def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

    all_results = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in args.per_act_token_opts:
                    for per_out_ch in args.per_out_ch_opts:
                        print(
                            f"\n=== {model}, experts={num_experts}, topk={topk},"
                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
                        )

                        config_results = []
                        for size_m in args.batch_sizes:
                            mkn = (size_m, size_k, size_n)
                            result = bench_run(
                                [],  # Not used anymore
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                mkn,
                            )
                            if result:
                                config_results.append(result)

                        # Print results table for this configuration
                        if config_results:
                            print(
                                f"\n{'Batch Size':<12}"
                                f"{'Triton (us)':<15}"
                                f"{'CUTLASS (us)':<15}"
                            )
                            print("-" * 45)
                            for result in config_results:
                                print(
                                    f"{result['batch_size']:<12}"
                                    f"{result['triton_time_us']:<15.2f}"
                                    f"{result['cutlass_time_us']:<15.2f}"
                                )

                        all_results.extend(config_results)

    print(f"\nTotal benchmarks completed: {len(all_results)}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
        across specified models/shapes/batches

        Example usage:
        python benchmark_cutlass_moe_fp8.py \
            --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
            --tp-sizes 8 \
            --batch-size 2 4 8 \
            --per-act-token-opts false \
            --per-out-ch-opts false
        """
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument(
        "--per-act-token-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-activation token quantization options (true/false)",
    )
    parser.add_argument(
        "--per-out-ch-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-output channel quantization options (true/false)",
    )

    args = parser.parse_args()
    main(args)
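A worked example of the CUDA-graph timing math in `bench_cuda_graph` above: each graph replay executes the 10 captured MoE calls, so the per-call latency divides the summed replay times by `num_iters * 10`:

# 100 replays at 2 ms each (illustrative numbers), 10 calls per replay.
latencies_ms = [2.0] * 100
num_iters = len(latencies_ms)
per_call_ms = sum(latencies_ms) / (num_iters * 10)
print(per_call_ms * 1000, "us per call")  # 200.0 us per call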
@@ -7,6 +7,10 @@ Benchmark script for device communicators:
 CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
 and SymmMemCommunicator (multimem, two-shot).

+for NCCL symmetric memory you need to set the environment variables
+NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
+not use fast NVLS implementation for all reduce.
+
 Usage:
     torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]

@@ -18,15 +22,21 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional

 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup

 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
-from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.pynccl import (
+    PyNcclCommunicator,
+    register_nccl_symmetric_ops,
+)
+from vllm.distributed.device_communicators.pynccl_allocator import (
+    set_graph_pool_id,
+)
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
 from vllm.utils import FlexibleArgumentParser
@@ -98,6 +108,7 @@ class CommunicatorBenchmark:
|
|||||||
)
|
)
|
||||||
if not self.pynccl_comm.disabled:
|
if not self.pynccl_comm.disabled:
|
||||||
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
|
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
|
||||||
|
register_nccl_symmetric_ops(self.pynccl_comm)
|
||||||
else:
|
else:
|
||||||
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
|
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
|
||||||
self.pynccl_comm = None
|
self.pynccl_comm = None
|
||||||
@@ -194,6 +205,15 @@ class CommunicatorBenchmark:
|
|||||||
None, # no env variable needed
|
None, # no env variable needed
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
communicators.append(
|
||||||
|
(
|
||||||
|
"pynccl-symm",
|
||||||
|
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
|
||||||
|
lambda t: True, # Always available if initialized
|
||||||
|
nullcontext(),
|
||||||
|
None, # no env variable needed
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if self.symm_mem_comm_multimem is not None:
|
if self.symm_mem_comm_multimem is not None:
|
||||||
comm = self.symm_mem_comm_multimem
|
comm = self.symm_mem_comm_multimem
|
||||||
@@ -244,12 +264,12 @@ class CommunicatorBenchmark:
|
|||||||
def benchmark_allreduce_single(
|
def benchmark_allreduce_single(
|
||||||
self,
|
self,
|
||||||
sequence_length: int,
|
sequence_length: int,
|
||||||
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
|
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
|
||||||
should_use_fn: Callable[[torch.Tensor], bool],
|
should_use_fn: Callable[[torch.Tensor], bool],
|
||||||
context,
|
context,
|
||||||
num_warmup: int,
|
num_warmup: int,
|
||||||
num_trials: int,
|
num_trials: int,
|
||||||
) -> Optional[float]:
|
) -> float | None:
|
||||||
"""Benchmark method with CUDA graph optimization."""
|
"""Benchmark method with CUDA graph optimization."""
|
||||||
try:
|
try:
|
||||||
# Create test tensor (2D: sequence_length x hidden_size)
|
# Create test tensor (2D: sequence_length x hidden_size)
|
||||||
@@ -271,7 +291,9 @@ class CommunicatorBenchmark:
|
|||||||
# Capture the graph using context manager
|
# Capture the graph using context manager
|
||||||
with context:
|
with context:
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
graph_pool = torch.cuda.graph_pool_handle()
|
||||||
|
set_graph_pool_id(graph_pool)
|
||||||
|
with torch.cuda.graph(graph, pool=graph_pool):
|
||||||
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
||||||
allreduce_fn(graph_input)
|
allreduce_fn(graph_input)
|
||||||
|
|
||||||
|
|||||||
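The capture-time change in the last hunk routes all graph allocations through an explicitly created memory pool so that multiple captures can share it. A standalone sketch of the underlying PyTorch pattern (independent of vLLM's set_graph_pool_id, which additionally records the pool id for its allocator):

    import torch

    x = torch.randn(1024, device="cuda")
    pool = torch.cuda.graph_pool_handle()  # memory pool shared by captures

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, pool=pool):  # allocations during capture come from `pool`
        y = x * 2  # recorded, not executed

    graph.replay()  # launches the captured kernels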
@@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,

@@ -96,6 +97,11 @@ def bench_run(
         a_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         for _ in range(num_repeats):
             fused_experts(
                 a,

@@ -103,10 +109,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def run_cutlass_moe(

@@ -125,6 +128,12 @@ def bench_run(
         per_act_token: bool,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         for _ in range(num_repeats):
             cutlass_moe_fp8(
                 a,

@@ -132,14 +141,11 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
                 ab_strides1,
                 ab_strides2,
                 c_strides1,
                 c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
             )
 
     def run_cutlass_from_graph(

@@ -156,6 +162,12 @@ def bench_run(
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):

@@ -165,14 +177,11 @@ def bench_run(
                 w2_q,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
                 ab_strides1,
                 ab_strides2,
                 c_strides1,
                 c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
             )
 
     def run_triton_from_graph(

@@ -185,6 +194,11 @@ def bench_run(
         w2_scale: torch.Tensor,
         a_scale: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):

@@ -194,10 +208,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def replay_graph(graph, num_repeats):
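Every hunk above follows the same migration: per-tensor scale arguments move out of the kernel call and into a quantization config object built once, before the repeat loop. The general shape of that refactor, as a runnable toy (QuantConfig and kernel here are stand-ins, not vLLM APIs):

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class QuantConfig:
        w1_scale: float
        w2_scale: float
        a1_scale: float | None = None

    def kernel(x: float, cfg: QuantConfig) -> float:
        # stand-in for fused_experts(...); real code reads scales off cfg
        return x * cfg.w1_scale * cfg.w2_scale

    cfg = QuantConfig(w1_scale=0.5, w2_scale=2.0)  # built once, outside the hot loop
    print(sum(kernel(i, cfg) for i in range(4)))   # 6.0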
@@ -7,7 +7,8 @@ import torch
 
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -79,9 +80,9 @@ def make_rand_lora_weight_tensor(
 
 
 def make_rand_tensors(
-    a_shape: tuple[int],
-    b_shape: tuple[int],
-    c_shape: tuple[int],
+    a_shape: tuple[int, ...],
+    b_shape: tuple[int, ...],
+    c_shape: tuple[int, ...],
     a_dtype: torch.dtype,
     b_dtype: torch.dtype,
     c_dtype: torch.dtype,

@@ -158,7 +159,7 @@ def ref_group_gemm(
     seq_lens_cpu: torch.Tensor,
     prompt_lora_mapping_cpu: torch.Tensor,
     scaling: float,
-    add_inputs: Optional[bool],
+    add_inputs: bool | None,
 ):
     """
     Torch group gemm reference implementation to test correctness of

@@ -243,7 +244,7 @@ class OpType(Enum):
         lora_rank: int,
         num_loras: int,
         num_slices: int,
-    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Given num_slices, return the shapes of the A, B, and C matrices
         in A x B = C, for the op_type

@@ -316,8 +317,8 @@ class BenchmarkContext:
     lora_rank: int
     sort_by_lora_id: bool
     dtype: torch.dtype
-    seq_length: Optional[int] = None
-    num_slices: Optional[int] = None  # num_slices for slice based ops
+    seq_length: int | None = None
+    num_slices: int | None = None  # num_slices for slice based ops
 
     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
         ctx = copy.copy(self)

@@ -464,7 +465,11 @@ class BenchmarkTensors:
         for field_name in LoRAKernelMeta.__dataclass_fields__:
             field = getattr(self.lora_kernel_meta, field_name)
             assert isinstance(field, torch.Tensor)
-            setattr(self.lora_kernel_meta, field_name, to_device(field))
+            setattr(
+                self.lora_kernel_meta,
+                field_name,
+                to_device(field) if field_name != "no_lora_flag_cpu" else field,
+            )
 
     def metadata(self) -> tuple[int, int, int]:
         """

@@ -512,6 +517,7 @@ class BenchmarkTensors:
             "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
             "lora_ids": self.lora_kernel_meta.active_lora_ids,
             "scaling": 1.0,
+            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
     def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:

@@ -552,10 +558,11 @@ class BenchmarkTensors:
             "lora_ids": self.lora_kernel_meta.active_lora_ids,
             "offset_start": 0,
             "add_inputs": add_inputs,
+            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
     def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: Optional[bool] = None
+        self, op_type: OpType, add_inputs: bool | None = None
     ) -> dict[str, Any]:
         if op_type.is_shrink_fn():
             assert add_inputs is None

@@ -569,7 +576,7 @@ class BenchmarkTensors:
         raise ValueError(f"Unrecognized optype {self}")
 
     def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+        self, op_type: OpType, expand_fn_add_inputs: bool | None
     ) -> bool:
         """
         Test correctness of op_type implementation against a grouped gemm

@@ -605,8 +612,8 @@ def bench_optype(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
-    expand_fn_add_inputs: Optional[bool] = None,
+    cuda_graph_nops: int | None = None,
+    expand_fn_add_inputs: bool | None = None,
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1

@@ -673,7 +680,7 @@ def bench_torch_mm(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
+    cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
     """
     Benchmark basic torch.mm as a roofline.

@@ -738,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
     """
 
 
-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
     compare = TBenchmark.Compare(timers)
     compare.print()
 
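The setattr change above keeps the no_lora_flag_cpu tensor pinned on the CPU while every other LoRA metadata tensor moves to the GPU, so the flag can be checked host-side without a device sync. The idiom in isolation (field names other than no_lora_flag_cpu are illustrative):

    import dataclasses
    import torch

    @dataclasses.dataclass
    class KernelMeta:
        token_indices: torch.Tensor      # illustrative device-side field
        no_lora_flag_cpu: torch.Tensor   # must stay on CPU for a cheap host-side check

    def move_to_device(meta: KernelMeta, device: str) -> None:
        for f in dataclasses.fields(meta):
            t = getattr(meta, f.name)
            if f.name != "no_lora_flag_cpu":
                setattr(meta, f.name, t.to(device))

    meta = KernelMeta(torch.arange(4), torch.tensor([True]))
    move_to_device(meta, "cpu")  # use "cuda" on a GPU machine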
@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import pandas as pd
 import torch

@@ -63,23 +62,23 @@ class BenchmarkTensors:
     a: torch.Tensor
 
     w_q: torch.Tensor
-    group_size: Optional[int]
+    group_size: int | None
     wtype: ScalarType
     w_g_s: torch.Tensor
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None
 
 
 @dataclass
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None
 
 
 def rand_data(shape, dtype=torch.float16, scale=1):

@@ -93,8 +92,8 @@ def quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"

@@ -113,7 +112,7 @@ def quantize_and_pack(
 
 
 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
     m, n, k = shape
 

@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
     return res
 
 
-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
 
 
 def bench(
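Most of the mechanical churn across these files is the same typing modernization: Optional[X] becomes X | None and Callable moves from typing to collections.abc. The two spellings are interchangeable at runtime on Python 3.10+; a quick self-check:

    from collections.abc import Callable
    from typing import Optional

    # On Python 3.10+ the two forms compare equal.
    assert Optional[int] == (int | None)

    fn: Callable[[int], int | None] = lambda x: x if x > 0 else None
    print(fn(3), fn(-3))  # 3 None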
@@ -14,6 +14,10 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm
 
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    _get_config_dtype_str,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config

@@ -134,43 +138,36 @@ def benchmark_config(
     def run():
         from vllm.model_executor.layers.fused_moe import override_config
 
+        if use_fp8_w8a8:
+            quant_dtype = torch.float8_e4m3fn
+        elif use_int8_w8a16:
+            quant_dtype = torch.int8
+        else:
+            quant_dtype = None
+
+        quant_config = FusedMoEQuantConfig.make(
+            quant_dtype=quant_dtype,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_quant_shape,
+        )
+
         with override_config(config):
-            if use_deep_gemm:
-                topk_weights, topk_ids, token_expert_indices = fused_topk(
-                    x, input_gating, topk, False
-                )
-                return fused_experts(
-                    x,
-                    w1,
-                    w2,
-                    topk_weights,
-                    topk_ids,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                )
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
+                x, input_gating, topk, renormalize=not use_deep_gemm
+            )
+            return fused_experts(
+                x,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                inplace=True,
+                quant_config=quant_config,
+                allow_deep_gemm=use_deep_gemm,
+            )
 
     # JIT compilation & warmup
     run()

@@ -414,7 +411,7 @@ class BenchmarkWorker:
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
-        dtype_str = get_config_dtype_str(
+        dtype_str = _get_config_dtype_str(
             dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
         )
         # NOTE(woosuk): The current naming convention uses w2.shape[2], which

@@ -547,7 +544,7 @@ def save_configs(
     block_quant_shape: list[int],
     save_dir: str,
 ) -> None:
-    dtype_str = get_config_dtype_str(
+    dtype_str = _get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
     )
 

@@ -560,7 +557,7 @@ def save_configs(
     filename = os.path.join(save_dir, filename)
     print(f"Writing best config to {filename}...")
     with open(filename, "w") as f:
-        json.dump(configs, f, indent=4)
+        json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
         f.write("\n")
 
 
@@ -582,18 +579,22 @@ def main(args: argparse.Namespace):
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
         intermediate_size = config.ffn_config.ffn_hidden_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] == "JambaForCausalLM":
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
-        "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "Qwen2MoeForCausalLM",
         "Qwen3MoeForCausalLM",

@@ -602,10 +603,18 @@ def main(args: argparse.Namespace):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
+    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+        text_config = config.get_text_config()
+        E = text_config.num_experts
+        topk = text_config.num_experts_per_tok
+        intermediate_size = text_config.moe_intermediate_size
+        hidden_size = text_config.hidden_size
     elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
         E = config.num_experts
         topk = config.moe_topk[0]
         intermediate_size = config.moe_intermediate_size[0]
+        hidden_size = config.hidden_size
     else:
         # Support for llama4
         config = config.get_text_config()

@@ -613,6 +622,7 @@ def main(args: argparse.Namespace):
         E = config.num_local_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     enable_ep = bool(args.enable_expert_parallel)
     if enable_ep:
         ensure_divisibility(E, args.tp_size, "Number of experts")

@@ -621,8 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
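The save_configs change stamps each tuning file with the Triton version that produced it, which makes stale configs detectable after an upgrade. A minimal sketch of writing and checking such a stamp (the file name and config contents are placeholders):

    import json

    import triton

    configs = {"1": {"BLOCK_SIZE_M": 64}}  # placeholder tuning data
    with open("tuned.json", "w") as f:
        json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
        f.write("\n")

    with open("tuned.json") as f:
        data = json.load(f)
    if data.get("triton_version") != triton.__version__:
        print("warning: configs were tuned under a different Triton version")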
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok
 
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute
@@ -3,16 +3,15 @@
 
 import random
 import time
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )
 
@@ -37,7 +36,7 @@ def main(
     seed: int,
     do_profile: bool,
     device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
+    kv_cache_dtype: str | None = None,
 ) -> None:
     current_platform.seed_everything(seed)
 
@@ -3,8 +3,8 @@
 
 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch
 
 import torch
@@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
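For reference, the deleted naive implementation computes PolyNorm, a weighted combination of RMS-normalized elementwise powers of the input over the hidden dimension H (this restates the removed code, with norm being the RMS normalization it defines):

    \mathrm{PolyNorm}(x) = w_0\,\mathrm{norm}(x^3) + w_1\,\mathrm{norm}(x^2) + w_2\,\mathrm{norm}(x) + b,
    \qquad
    \mathrm{norm}(u) = \frac{u}{\sqrt{\frac{1}{H}\sum_{i=1}^{H} u_i^2 + \varepsilon}}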
@@ -7,7 +7,8 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
172  benchmarks/kernels/benchmark_reshape_and_cache.py  (new file)
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    create_kv_caches_with_random,
+)
+
+logger = init_logger(__name__)
+
+
+@torch.inference_mode()
+def run_benchmark(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    num_iters: int,
+    benchmark_mode: str,
+    device: str = "cuda",
+) -> float:
+    """Return latency (seconds) for given num_tokens."""
+
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+
+    current_platform.seed_everything(42)
+    torch.set_default_device(device)
+
+    # create random key / value tensors [T, H, D].
+    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+    value = torch.randn_like(key)
+
+    # prepare the slot mapping.
+    # each token is assigned a unique slot in the KV-cache.
+    num_slots = block_size * num_blocks
+    if num_tokens > num_slots:
+        raise ValueError("num_tokens cannot exceed the total number of cache slots")
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+    key_caches, value_caches = create_kv_caches_with_random(
+        num_blocks,
+        block_size,
+        1,  # num_layers
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches
+
+    # compute per-kernel scaling factors for fp8 conversion (if used).
+    k_scale = (key.amax() / 64.0).to(torch.float32)
+    v_scale = (value.amax() / 64.0).to(torch.float32)
+
+    function_under_test = lambda: ops.reshape_and_cache(
+        key,  # noqa: F821
+        value,  # noqa: F821
+        key_cache,  # noqa: F821
+        value_cache,  # noqa: F821
+        slot_mapping,  # noqa: F821
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
+
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
+    def run_cuda_benchmark(n_iters: int) -> float:
+        nonlocal key, value, key_cache, value_cache, slot_mapping
+        torch.cuda.synchronize()
+        start = time.perf_counter()
+        for _ in range(n_iters):
+            function_under_test()
+        torch.cuda.synchronize()
+        end = time.perf_counter()
+        return (end - start) / n_iters
+
+    # warm-up
+    run_cuda_benchmark(3)
+
+    lat = run_cuda_benchmark(num_iters)
+
+    # free tensors to mitigate OOM when sweeping
+    del key, value, key_cache, value_cache, slot_mapping
+    torch.cuda.empty_cache()
+
+    return lat
+
+
+def main(args):
+    rows = []
+    for exp in range(1, 17):
+        n_tok = 2**exp
+        lat = run_benchmark(
+            num_tokens=n_tok,
+            num_heads=args.num_heads,
+            head_size=args.head_size,
+            block_size=args.block_size,
+            num_blocks=args.num_blocks,
+            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+            kv_cache_dtype=args.kv_cache_dtype,
+            num_iters=args.iters,
+            benchmark_mode=args.mode,
+            device="cuda",
+        )
+        rows.append([n_tok, lat * 1e6])  # convert to microseconds
+
+    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
+    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+
+    parser.add_argument("--num-heads", type=int, default=128)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+        default=128,
+    )
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--num-blocks", type=int, default=128 * 128)
+
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["half", "bfloat16", "float"],
+        default="bfloat16",
+    )
+
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8"],
+        default="auto",
+    )
+
+    parser.add_argument("--iters", type=int, default=200)
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
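The slot mapping in the new file flattens the paged KV cache: slot s for a token lands in block s // block_size at row s % block_size. A tiny sketch of that indexing convention (an illustration, not a vLLM API):

    block_size = 16
    num_blocks = 128

    slot = 37  # a flat slot index drawn from range(block_size * num_blocks)
    block_idx, block_offset = divmod(slot, block_size)
    assert (block_idx, block_offset) == (2, 5)  # token lives in block 2, row 5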
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 

@@ -9,11 +7,14 @@ import torch
 from tabulate import tabulate
 
 from vllm import _custom_ops as ops
+from vllm.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random_flash,
 )
 

@@ -31,6 +32,8 @@ def run_benchmark(
     kv_cache_dtype: str,
     kv_cache_layout: str,
     num_iters: int,
+    implementation: str,
+    benchmark_mode: str,
     device: str = "cuda",
 ) -> float:
     """Return latency (seconds) for given num_tokens."""

@@ -38,6 +41,14 @@ def run_benchmark(
     if kv_cache_dtype == "fp8" and head_size % 16:
         raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
 
+    if implementation not in ("cuda", "triton"):
+        raise ValueError(
+            f"Unsupported implementation: {implementation}. "
+            "Only 'cuda' and 'triton' are supported."
+        )
+    if implementation == "triton" and kv_cache_layout == "HND":
+        return float("nan")  # Triton does not support HND layout yet.
+
     current_platform.seed_everything(42)
     torch.set_default_device(device)
 

@@ -65,27 +76,49 @@ def run_benchmark(
         cache_layout=kv_cache_layout,
     )
     key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches
+
     # compute per-kernel scaling factors for fp8 conversion (if used).
     k_scale = (key.amax() / 64.0).to(torch.float32)
     v_scale = (value.amax() / 64.0).to(torch.float32)
 
+    if implementation == "cuda":
+        function_under_test = lambda: ops.reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    else:
+        function_under_test = lambda: triton_reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
         torch.cuda.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
-            ops.reshape_and_cache_flash(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                slot_mapping,
-                kv_cache_dtype,
-                k_scale,
-                v_scale,
-            )
-        torch.cuda.synchronize()
+            function_under_test()
+        torch.cuda.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
 

@@ -116,10 +149,16 @@ def main(args):
                 kv_cache_dtype=args.kv_cache_dtype,
                 kv_cache_layout=layout,
                 num_iters=args.iters,
+                implementation=args.implementation,
+                benchmark_mode=args.mode,
                 device="cuda",
             )
             rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
 
+    print(
+        f"Benchmark results for implementation {args.implementation}"
+        f" (measuring with {args.mode}):"
+    )
     print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
 
 
@@ -151,6 +190,21 @@ if __name__ == "__main__":
     )
 
     parser.add_argument("--iters", type=int, default=100)
+
+    parser.add_argument(
+        "--implementation",
+        type=str,
+        choices=["cuda", "triton"],
+        default="cuda",
+    )
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
     args = parser.parse_args()
 
     main(args)
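In cudagraph mode, both reshape-and-cache benchmarks swap the raw kernel call for a captured graph, so the timed loop pays one g.replay() launch instead of per-op launch overhead. The harness pattern in isolation (the kernel body here is a placeholder):

    import time

    import torch

    x = torch.randn(1 << 20, device="cuda")
    fn = lambda: torch.nn.functional.silu(x)

    fn()  # eager warmup before capture
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        fn()
    torch.cuda.synchronize()

    start = time.perf_counter()
    for _ in range(100):
        g.replay()  # a single graph launch replaces per-op kernel launches
    torch.cuda.synchronize()
    print(f"{(time.perf_counter() - start) / 100 * 1e6:.3f} us/iter")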
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from typing import Optional, Union
 
 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm

@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:

@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)

@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape

@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate
-from typing import Optional
 
 import nvtx
 import torch

@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
     seq_len: int,
     num_heads: int,
     head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
     dtype: torch.dtype,
     seed: int,
     device: str,
@@ -1,5 +1,19 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Comprehensive 3-way SiLU Benchmark Suite
|
||||||
|
|
||||||
|
This benchmark compares three SiLU implementations:
|
||||||
|
1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
|
||||||
|
2. Triton Kernel - Triton-based implementation
|
||||||
|
|
||||||
|
The suite generates detailed performance comparisons including:
|
||||||
|
- Memory bandwidth utilization
|
||||||
|
- Speedup ratios (baseline vs optimized implementations)
|
||||||
|
- Performance across different expert configurations and token distributions
|
||||||
|
"""
|
||||||
|
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
@@ -7,7 +21,7 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
||||||
silu_mul_fp8_quant_deep_gemm_cuda,
|
persistent_masked_m_silu_mul_quant,
|
||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.triton_utils import tl, triton
|
from vllm.triton_utils import tl, triton
|
||||||
@@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
|
|||||||
num_parallel_tokens,
|
num_parallel_tokens,
|
||||||
group_size: int = 128,
|
group_size: int = 128,
|
||||||
eps: float = 1e-10,
|
eps: float = 1e-10,
|
||||||
|
expert_offsets: torch.Tensor = None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
|
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
|
||||||
|
|
||||||
@@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
|
|||||||
|
|
||||||
|
|
||||||
# Parse generation strategies
|
# Parse generation strategies
|
||||||
strategies = ["uniform", "max_t", "first_t"]
|
strategies = ["random_imbalanced", "uniform", "max_t"]
|
||||||
|
|
||||||
|
|
||||||
def benchmark(
|
def benchmark(
|
||||||
@@ -195,15 +210,27 @@ def benchmark(
|
|||||||
current_platform.seed_everything(42 + seed_offset)
|
current_platform.seed_everything(42 + seed_offset)
|
||||||
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
|
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
|
||||||
|
|
||||||
if gen_strategy == "uniform":
|
if gen_strategy == "random_imbalanced":
|
||||||
r = torch.rand(size=(E,), device="cuda")
|
|
||||||
|
def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
|
||||||
|
mean = total_tokens // n_e
|
||||||
|
min_max = mean // ratio
|
||||||
|
e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
|
||||||
|
e[0] = min_max
|
||||||
|
r = torch.rand(size=(E - 1,))
|
||||||
|
r /= r.sum()
|
||||||
|
r *= total_tokens - min_max
|
||||||
|
r = r.round().long()
|
||||||
|
e[1:] = r.to(device=device)
|
||||||
|
return e
|
||||||
|
|
||||||
|
tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
|
||||||
|
elif gen_strategy == "uniform":
|
||||||
|
r = torch.rand(size=(E,))
|
||||||
r /= r.sum()
|
r /= r.sum()
|
||||||
r *= total_tokens
|
r *= total_tokens
|
||||||
tokens_per_expert = r.int()
|
r = r.round().long()
|
||||||
tokens_per_expert = torch.minimum(
|
tokens_per_expert = r
|
||||||
tokens_per_expert,
|
|
||||||
torch.ones((E,), device=r.device, dtype=torch.int) * T,
|
|
||||||
)
|
|
||||||
elif gen_strategy == "max_t":
|
elif gen_strategy == "max_t":
|
||||||
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
|
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
|
||||||
tokens_per_expert.fill_(total_tokens / E)
|
tokens_per_expert.fill_(total_tokens / E)
|
||||||
@@ -281,40 +308,34 @@ def benchmark(


 def create_comparison_plot(
-    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+    ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
 ):
-    """Create a comparison plot for a specific generation strategy"""
-    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+    fig, ax = plt.subplots(1, 1, figsize=(18, 6))

     # Configure x-axis positions
     x = np.arange(len(config_labels))
-    width = 0.35
+    width = 0.25

     # Execution Time plot (lower is better)
+    ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
     ax.bar(
-        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
+        x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
     )
-    ax.bar(
-        x + width / 2,
-        baseline_times,
-        width,
-        label="Baseline",
-        alpha=0.8,
-        color="orange",
-    )

-    # Add speedup labels over each bar pair
+    # Add speedup labels over each bar trio
     for i in range(len(x)):
-        speedup = ratio[i]
-        max_height = max(cuda_times[i], baseline_times[i])
+        triton_v2_speedup = ratios[i][1]  # triton/v2
+        max_height = max(silu_v2_times[i], triton_times[i])

+        # Triton/V2 speedup
         ax.text(
-            x[i],
+            x[i] + width / 2,
             max_height + max_height * 0.02,
-            f"{speedup:.2f}x",
+            f"{triton_v2_speedup:.2f}x",
             ha="center",
             va="bottom",
             fontweight="bold",
-            fontsize=9,
+            fontsize=8,
         )

     ax.set_xlabel("Configuration")
@@ -332,56 +353,75 @@ def create_comparison_plot(


 def create_combined_plot(all_results):
-    """Create a combined plot with all strategies in one PNG"""
     num_strategies = len(all_results)
-    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))

     if num_strategies == 1:
         axes = [axes]

     for idx, (
         strategy_name,
-        ratio,
-        cuda_times,
-        baseline_times,
+        all_ratios,
+        all_silu_v2_results,
+        all_triton_results,
         config_labels,
+        config_x_axis,
     ) in enumerate(all_results):
         ax = axes[idx]

+        # Flatten the nested results to get bandwidth percentages for plotting
+        silu_v2_bandwidths = []
+        triton_bandwidths = []
+        flat_ratios = []
+
+        for config_results in all_silu_v2_results:
+            for result in config_results:
+                silu_v2_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_results in all_triton_results:
+            for result in config_results:
+                triton_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_ratios in all_ratios:
+            for ratio in config_ratios:
+                flat_ratios.append(ratio)

         # Configure x-axis positions
         x = np.arange(len(config_labels))
-        width = 0.35
+        width = 0.25

-        # Execution Time plot (lower is better)
+        # Bandwidth utilization plot (higher is better)
         ax.bar(
-            x - width / 2,
-            cuda_times,
+            x,
+            silu_v2_bandwidths,
             width,
-            label="CUDA Kernel",
+            label="SiLU V2 (CUDA)",
             alpha=0.8,
             color="blue",
         )
         ax.bar(
-            x + width / 2,
-            baseline_times,
+            x + width,
+            triton_bandwidths,
             width,
-            label="Baseline",
+            label="Triton Kernel",
             alpha=0.8,
-            color="orange",
+            color="green",
         )

-        # Add speedup labels over each bar pair
+        # Add speedup labels over each bar trio
         for i in range(len(x)):
-            speedup = ratio[i]
-            max_height = max(cuda_times[i], baseline_times[i])
+            triton_v2_speedup = flat_ratios[i]  # triton/v2
+            max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])

+            # Triton/V2 speedup
             ax.text(
-                x[i],
+                x[i] + width / 2,
                 max_height + max_height * 0.02,
-                f"{speedup:.2f}x",
+                f"{triton_v2_speedup:.2f}x",
                 ha="center",
                 va="bottom",
                 fontweight="bold",
-                fontsize=9,
+                fontsize=8,
             )

         ax.set_xlabel("Configuration")
@@ -395,7 +435,7 @@ def create_combined_plot(all_results):
     ax.grid(True, alpha=0.3)

     plt.tight_layout()
-    filename = "../../silu_bench/silu_benchmark_combined.png"
+    filename = "silu_benchmark_combined_3way.png"
     plt.savefig(filename, dpi=300, bbox_inches="tight")
     plt.show()

@@ -405,7 +445,9 @@ def create_combined_plot(all_results):
 outer_dim = 7168
 configs = [
     # DeepSeekV3 Configs
+    # (1, 56, 7168),
     (8, 1024, 7168),
+    # (32, 56, 7168),
     # DeepSeekV3 Configs
     (32, 1024, 7168),
     # DeepSeekV3 Configs
@@ -417,6 +459,7 @@ num_warmups = 20

 strategy_descriptions = {
     "uniform": "Uniform Random",
+    "random_imbalanced": "Imbalanced Random",
     "max_t": "Even Assignment",
     "first_t": "experts[0] = T, experts[1:] = 0",
 }
@@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies):
    print(f"Testing strategy: {strategy_descriptions[strategy]}")
    print(f"{'=' * 60}")

-    # Collect benchmark data for both algorithms
+    # Collect benchmark data for all three algorithms
     config_labels = []
     config_x_axis = []
-    all_cuda_results = []
-    all_baseline_results = []
+    all_silu_v2_results = []
+    all_triton_results = []
     all_ratios = []

     for E, T, H in configs:
-        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+        total_tokens_config = []
+        for i in [8, 16, 32, 64, 128, 256, 512]:
+            if i <= T:
+                total_tokens_config.append(i * E)
         config_x_axis.append(total_tokens_config)

-        cuda_results = []
-        baseline_results = []
+        silu_v2_results = []
+        triton_results = []
         ratios = []

         for total_tokens in total_tokens_config:
             config_label = f"E={E},T={T},H={H},TT={total_tokens}"
             config_labels.append(config_label)

-            # CUDA kernel results
-            time_ms_cuda, gflops, gbps, perc = benchmark(
-                silu_mul_fp8_quant_deep_gemm_cuda,
+            # SiLU V2 (CUDA kernel) results
+            time_ms_silu_v2, gflops, gbps, perc = benchmark(
+                persistent_masked_m_silu_mul_quant,
                 E,
                 T,
                 H,
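A quick worked example of the new `total_tokens_config` filter (illustrative, not part of the diff): the multipliers are kept only while they do not exceed `T`, so a config with `T = 1024` gets the full sweep while a smaller `T` would stop early.

```python
# For (E, T) = (8, 1024) every multiplier passes the i <= T check:
E, T = 8, 1024
total_tokens_config = [i * E for i in [8, 16, 32, 64, 128, 256, 512] if i <= T]
print(total_tokens_config)  # [64, 128, 256, 512, 1024, 2048, 4096]
```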
@@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies):
                 num_warmups=num_warmups,
                 gen_strategy=strategy,
             )
-            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+            silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))

-            # Baseline results
+            # Triton kernel results
             time_ms_triton, gflops, gbps, perc = benchmark(
                 silu_mul_fp8_quant_deep_gemm_triton,
                 E,
@@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies):
                 num_warmups=num_warmups,
                 gen_strategy=strategy,
             )
-            baseline_results.append((time_ms_triton, gflops, gbps, perc))
-            ratios.append(time_ms_triton / time_ms_cuda)
+            triton_results.append((time_ms_triton, gflops, gbps, perc))

-            print(f"Completed: {config_label}")
-        all_cuda_results.append(cuda_results)
-        all_baseline_results.append(baseline_results)
+            # Calculate speedup ratios (triton baseline / implementation)
+            triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+            ratios.append(triton_v2_ratio)
+
+            print(
+                f"Completed: {config_label}:"
+                f" V2: {time_ms_silu_v2:.3f}ms,"
+                f" Triton: {time_ms_triton:.3f}ms"
+            )
+
+        all_silu_v2_results.append(silu_v2_results)
+        all_triton_results.append(triton_results)
         all_ratios.append(ratios)

         # Store results for combined plotting
@@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies):
         (
             strategy_descriptions[strategy],
             all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
             config_labels,
             config_x_axis,
         )
@@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies):

     # Print summary table for this strategy
     print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
-    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
-    print("-" * 60)
+    print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+    print("-" * 90)

     for i, (E, T, H) in enumerate(configs):
-        speedup = baseline_results[i][0] / cuda_results[i][0]
+        # Get the first result for each config (simplifying for summary)
+        v2_time = silu_v2_results[i][0]
+        triton_time = triton_results[i][0]
+        triton_v2_speedup = triton_time / v2_time
         config_label = f"E={E:3d},T={T:4d},H={H:4d}"
         print(
-            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
-            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+            f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+            f"{triton_v2_speedup:8.2f}x"
         )
@@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results):
     num_strategies = len(all_results)
     num_configs = len(configs)

-    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
     fig, axs = plt.subplots(
-        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+        num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
     )

     # Add main title to the entire figure
     fig.suptitle(
-        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
-        fontsize=16,
+        "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+        fontsize=18,
         fontweight="bold",
         y=0.98,
     )
@@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results):
         (
             strategy_name,
             all_ratios,
-            all_cuda_results,
-            all_baseline_results,
+            all_silu_v2_results,
+            all_triton_results,
             config_labels,
             config_x_axis,
         ) = result
@@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results):
             ratios = all_ratios[config_idx]
             total_tokens_values = config_x_axis[config_idx]

-            # Extract CUDA and Triton bandwidth percentages
-            cuda_bandwidth_percentages = [
-                result[3] for result in all_cuda_results[config_idx]
+            # Extract speedup ratios
+            triton_v2_ratios = [ratio for ratio in ratios]
+
+            # Extract bandwidth percentages for all implementations
+            v2_bandwidth_percentages = [
+                result[3] for result in all_silu_v2_results[config_idx]
             ]
             triton_bandwidth_percentages = [
-                result[3] for result in all_baseline_results[config_idx]
+                result[3] for result in all_triton_results[config_idx]
             ]

             # Plot speedup ratios vs total tokens (left plot)
             ax_speedup.plot(
-                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+                total_tokens_values,
+                triton_v2_ratios,
+                "go-",
+                linewidth=3,
+                markersize=8,
+                label="Triton/V2 Speedup",
             )
             ax_speedup.set_title(
-                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+                f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
                 fontsize=12,
                 fontweight="bold",
             )
             ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
             ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+            ax_speedup.legend(prop={"weight": "bold"})
             ax_speedup.grid(True, alpha=0.3)

+            # Plot bandwidth utilization (right plot)
             ax_bandwidth.plot(
                 total_tokens_values,
-                cuda_bandwidth_percentages,
-                "ro-",
+                v2_bandwidth_percentages,
+                "o-",
                 linewidth=3,
                 markersize=8,
-                label="CUDA",
+                label="SiLU V2",
+                color="blue",
             )
             ax_bandwidth.plot(
                 total_tokens_values,
                 triton_bandwidth_percentages,
-                "go-",
+                "o-",
                 linewidth=3,
                 markersize=8,
                 label="Triton",
+                color="green",
             )
             ax_bandwidth.set_title(
                 f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results):
             for label in ax.get_xticklabels() + ax.get_yticklabels():
                 label.set_fontweight("bold")

-            # Add value labels on speedup points
-            for x, y in zip(total_tokens_values, ratios):
+            # Add value labels on Triton/V2 speedup points
+            for x, y in zip(total_tokens_values, triton_v2_ratios):
                 ax_speedup.annotate(
                     f"{y:.2f}x",
                     (x, y),
                     textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=10,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
-                )
-
-            # Add value labels on CUDA bandwidth points
-            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
-                    xytext=(0, 12),
-                    ha="center",
-                    fontsize=9,
-                    fontweight="bold",
-                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
-                )
-
-            # Add value labels on Triton bandwidth points
-            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
-                ax_bandwidth.annotate(
-                    f"{y:.1f}%",
-                    (x, y),
-                    textcoords="offset points",
                     xytext=(0, -15),
                     ha="center",
                     fontsize=9,
@@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results):

     plt.tight_layout()
     plt.subplots_adjust(top=0.93)  # Make room for main title
-    filename = "silu_benchmark_total_tokens.png"
+    filename = "silu_benchmark_total_tokens_3way.png"
     plt.savefig(filename, dpi=300, bbox_inches="tight")
     plt.show()

     return filename


-# Create combined plot with all strategies
-combined_plot_filename = create_total_tokens_plot(all_results)
+# Create comprehensive 3-way comparison plots
+combined_plot_filename = create_combined_plot(all_results)
+total_tokens_plot_filename = create_total_tokens_plot(all_results)

-print(f"\n{'=' * 60}")
-print("Benchmark Complete!")
-print(f"Generated combined plot: {combined_plot_filename}")
-print(f"{'=' * 60}")
+print(f"\n{'=' * 80}")
+print("3-Way Benchmark Suite Complete!")
+print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print("Compared: SiLU V2 (CUDA), and Triton implementations")
+print(f"{'=' * 80}")
@@ -4,7 +4,6 @@
|
|||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import torch
|
import torch
|
||||||
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
|||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def benchmark_decode(
|
def benchmark_decode(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
quant_dtypes: tuple[
|
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
|
||||||
],
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
max_seq_len: int,
|
max_seq_len: int,
|
||||||
num_heads: tuple[int, int] = (64, 8),
|
num_heads: tuple[int, int] = (64, 8),
|
||||||
|
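This hunk (and many like it below) is a mechanical modernization: `Optional[X]` and `Union[X, Y]` from `typing` become PEP 604 unions, which need no import on Python 3.10+. A minimal before/after sketch (illustrative names):

```python
# Before (typing module):
from typing import Optional, Union

def f(x: Optional[int], y: Union[str, float]) -> Optional[str]: ...

# After (PEP 604, Python 3.10+; no typing import needed):
def g(x: int | None, y: str | float) -> str | None: ...
```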
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
@@ -11,13 +11,13 @@ from datetime import datetime
 from typing import Any

 import torch
-import triton
 from tqdm import tqdm

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    _w8a8_block_fp8_matmul,
+    _w8a8_triton_block_scaled_mm,
 )
 from vllm.platforms import current_platform
+from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)
@@ -83,7 +83,7 @@ def w8a8_block_matmul(
     )

     if A.dtype == torch.float8_e4m3fn:
-        kernel = _w8a8_block_fp8_matmul
+        kernel = _w8a8_triton_block_scaled_mm
     else:
         raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# fmt: off
 # ruff: noqa: E501
 import time

@@ -8,27 +7,33 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    get_col_major_tma_aligned_tensor,
     per_token_group_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.triton_utils import triton
-from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (
+    calc_diff,
+    fp8_gemm_nt,
+    get_col_major_tma_aligned_tensor,
+    per_block_cast_to_fp8,
+)


-def benchmark_shape(m: int,
-                    n: int,
-                    k: int,
-                    warmup: int = 100,
-                    repeat: int = 10000,
-                    verbose: bool = False) -> dict:
+def benchmark_shape(
+    m: int,
+    n: int,
+    k: int,
+    warmup: int = 100,
+    repeat: int = 10000,
+    verbose: bool = False,
+) -> dict:
     """Benchmark all implementations for a specific (m, n, k) shape."""
     if verbose:
         print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")

     # Create test tensors
-    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
-    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
+    A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)

     # Reference result in BF16
     torch.cuda.synchronize()
@@ -45,34 +50,39 @@ def benchmark_shape(m: int,
     # Pre-quantize A for all implementations
     A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
     A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
+    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
     A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
     A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        A, block_size[1], column_major_scales=True)
+        A, block_size[1], column_major_scales=True
+    )

     # === DeepGEMM Implementation ===
     def deepgemm_gemm():
-        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
-                    (B_deepgemm, B_scale_deepgemm),
-                    C_deepgemm)
+        fp8_gemm_nt(
+            (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
+        )
         return C_deepgemm

     # === vLLM Triton Implementation ===
     def vllm_triton_gemm():
-        return w8a8_block_fp8_matmul(A_vllm,
-                                     B_vllm,
-                                     A_scale_vllm,
-                                     B_scale_vllm,
-                                     block_size,
-                                     output_dtype=torch.bfloat16)
+        return w8a8_triton_block_scaled_mm(
+            A_vllm,
+            B_vllm,
+            A_scale_vllm,
+            B_scale_vllm,
+            block_size,
+            output_dtype=torch.bfloat16,
+        )

     # === vLLM CUTLASS Implementation ===
     def vllm_cutlass_gemm():
-        return ops.cutlass_scaled_mm(A_vllm_cutlass,
-                                     B_vllm.T,
-                                     scale_a=A_scale_vllm_cutlass,
-                                     scale_b=B_scale_vllm.T,
-                                     out_dtype=torch.bfloat16)
+        return ops.cutlass_scaled_mm(
+            A_vllm_cutlass,
+            B_vllm.T,
+            scale_a=A_scale_vllm_cutlass,
+            scale_b=B_scale_vllm.T,
+            out_dtype=torch.bfloat16,
+        )

     # Run correctness check first
     if verbose:
|
|||||||
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
|
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
|
||||||
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
|
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
|
||||||
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
|
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
|
||||||
print("vLLM Triton vs DeepGEMM difference: "
|
print(
|
||||||
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
|
"vLLM Triton vs DeepGEMM difference: "
|
||||||
print("vLLM CUTLASS vs DeepGEMM difference: "
|
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
|
||||||
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
|
)
|
||||||
|
print(
|
||||||
|
"vLLM CUTLASS vs DeepGEMM difference: "
|
||||||
|
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
|
||||||
|
)
|
||||||
|
|
||||||
# Benchmark implementations
|
# Benchmark implementations
|
||||||
implementations = {
|
implementations = {
|
||||||
"DeepGEMM": deepgemm_gemm,
|
"DeepGEMM": deepgemm_gemm,
|
||||||
"vLLM Triton": vllm_triton_gemm,
|
"vLLM Triton": vllm_triton_gemm,
|
||||||
"vLLM CUTLASS": vllm_cutlass_gemm
|
"vLLM CUTLASS": vllm_cutlass_gemm,
|
||||||
}
|
}
|
||||||
|
|
||||||
benchmark_results = {
|
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
|
||||||
"shape": {
|
|
||||||
"m": m,
|
|
||||||
"n": n,
|
|
||||||
"k": k
|
|
||||||
},
|
|
||||||
"implementations": {}
|
|
||||||
}
|
|
||||||
|
|
||||||
for name, func in implementations.items():
|
for name, func in implementations.items():
|
||||||
# Warmup
|
# Warmup
|
||||||
@@ -136,38 +143,36 @@ def benchmark_shape(m: int,
|
|||||||
"tflops": tflops,
|
"tflops": tflops,
|
||||||
"gb_s": gb_s,
|
"gb_s": gb_s,
|
||||||
"diff": {
|
"diff": {
|
||||||
"DeepGEMM":
|
"DeepGEMM": 0.0
|
||||||
0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
|
if name == "DeepGEMM"
|
||||||
"Reference":
|
else calc_diff(func(), C_deepgemm),
|
||||||
deepgemm_diff if name == "DeepGEMM" else
|
"Reference": deepgemm_diff
|
||||||
(vllm_triton_diff
|
if name == "DeepGEMM"
|
||||||
if name == "vLLM Triton" else vllm_cutlass_diff)
|
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
|
||||||
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate speedups
|
# Calculate speedups
|
||||||
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
|
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
|
||||||
for name, data in benchmark_results["implementations"].items():
|
for name, data in benchmark_results["implementations"].items():
|
||||||
if name != "DeepGEMM":
|
if name != "DeepGEMM":
|
||||||
speedup = baseline / data["time_ms"]
|
speedup = baseline / data["time_ms"]
|
||||||
benchmark_results["implementations"][name][
|
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
|
||||||
"speedup_vs_deepgemm"] = speedup
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"DeepGEMM is {1/speedup:.2f}x "
|
print(
|
||||||
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
|
f"DeepGEMM is {1 / speedup:.2f}x "
|
||||||
|
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
|
||||||
|
)
|
||||||
|
|
||||||
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
|
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
|
||||||
"time_ms"]
|
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
|
||||||
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
|
|
||||||
"time_ms"]
|
|
||||||
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
||||||
benchmark_results["implementations"]["vLLM CUTLASS"][
|
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
|
||||||
"speedup_vs_triton"] = cutlass_vs_triton
|
cutlass_vs_triton
|
||||||
|
)
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(
|
||||||
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
||||||
@@ -179,8 +184,7 @@ def benchmark_shape(m: int,
|
|||||||
|
|
||||||
def format_table_row(values, widths):
|
def format_table_row(values, widths):
|
||||||
"""Format a row with specified column widths."""
|
"""Format a row with specified column widths."""
|
||||||
return "| " + " | ".join(f"{val:{w}}"
|
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
|
||||||
for val, w in zip(values, widths)) + " |"
|
|
||||||
|
|
||||||
|
|
||||||
def print_table(headers, rows, title=None):
|
def print_table(headers, rows, title=None):
|
||||||
@@ -288,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["DeepGEMM"]
|
impl_data = result["implementations"]["DeepGEMM"]
|
||||||
deepgemm_rows.append([
|
deepgemm_rows.append(
|
||||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
[
|
||||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
|
shape["m"],
|
||||||
])
|
shape["n"],
|
||||||
|
shape["k"],
|
||||||
|
f"{impl_data['time_us']:.1f}",
|
||||||
|
f"{impl_data['tflops']:.1f}",
|
||||||
|
f"{impl_data['gb_s']:.1f}",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
print_table(deepgemm_headers,
|
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
|
||||||
deepgemm_rows,
|
|
||||||
title="DeepGEMM Implementation:")
|
|
||||||
|
|
||||||
# Print vLLM Triton table
|
# Print vLLM Triton table
|
||||||
triton_headers = [
|
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
|
||||||
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
|
|
||||||
]
|
|
||||||
triton_rows = []
|
triton_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["vLLM Triton"]
|
impl_data = result["implementations"]["vLLM Triton"]
|
||||||
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
triton_rows.append([
|
triton_rows.append(
|
||||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
[
|
||||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
shape["m"],
|
||||||
format_speedup(speedup)
|
shape["n"],
|
||||||
])
|
shape["k"],
|
||||||
|
f"{impl_data['time_us']:.1f}",
|
||||||
|
f"{impl_data['tflops']:.1f}",
|
||||||
|
f"{impl_data['gb_s']:.1f}",
|
||||||
|
format_speedup(speedup),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
print_table(triton_headers,
|
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
|
||||||
triton_rows,
|
|
||||||
title="vLLM Triton Implementation:")
|
|
||||||
|
|
||||||
# Print vLLM CUTLASS table
|
# Print vLLM CUTLASS table
|
||||||
cutlass_headers = [
|
cutlass_headers = [
|
||||||
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
|
"m",
|
||||||
"vs Triton"
|
"n",
|
||||||
|
"k",
|
||||||
|
"Time (μs)",
|
||||||
|
"TFLOPS",
|
||||||
|
"GB/s",
|
||||||
|
"vs DeepGEMM",
|
||||||
|
"vs Triton",
|
||||||
]
|
]
|
||||||
cutlass_rows = []
|
cutlass_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@@ -327,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
impl_data = result["implementations"]["vLLM CUTLASS"]
|
impl_data = result["implementations"]["vLLM CUTLASS"]
|
||||||
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
||||||
cutlass_rows.append([
|
cutlass_rows.append(
|
||||||
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
[
|
||||||
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
shape["m"],
|
||||||
format_speedup(vs_deepgemm),
|
shape["n"],
|
||||||
format_speedup(vs_triton)
|
shape["k"],
|
||||||
])
|
f"{impl_data['time_us']:.1f}",
|
||||||
|
f"{impl_data['tflops']:.1f}",
|
||||||
|
f"{impl_data['gb_s']:.1f}",
|
||||||
|
format_speedup(vs_deepgemm),
|
||||||
|
format_speedup(vs_triton),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
print_table(cutlass_headers,
|
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
|
||||||
cutlass_rows,
|
|
||||||
title="vLLM CUTLASS Implementation:")
|
|
||||||
|
|
||||||
# Calculate and print averages
|
# Calculate and print averages
|
||||||
print("\n===== AVERAGE PERFORMANCE =====")
|
print("\n===== AVERAGE PERFORMANCE =====")
|
||||||
|
|
||||||
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
||||||
avg_metrics = {
|
avg_metrics = {
|
||||||
impl: {
|
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
|
||||||
"tflops": 0,
|
|
||||||
"gb_s": 0,
|
|
||||||
"time_ms": 0
|
|
||||||
}
|
|
||||||
for impl in implementations
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@@ -366,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
||||||
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
||||||
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
||||||
avg_rows.append([
|
avg_rows.append(
|
||||||
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
|
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
|
||||||
])
|
)
|
||||||
|
|
||||||
print_table(avg_headers, avg_rows)
|
print_table(avg_headers, avg_rows)
|
||||||
|
|
||||||
@@ -376,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_speedups = {
|
avg_speedups = {
|
||||||
"DeepGEMM vs vLLM Triton": 0,
|
"DeepGEMM vs vLLM Triton": 0,
|
||||||
"DeepGEMM vs vLLM CUTLASS": 0,
|
"DeepGEMM vs vLLM CUTLASS": 0,
|
||||||
"vLLM CUTLASS vs vLLM Triton": 0
|
"vLLM CUTLASS vs vLLM Triton": 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
||||||
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
||||||
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
|
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
|
||||||
"time_ms"]
|
|
||||||
|
|
||||||
avg_speedups[
|
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
||||||
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
||||||
avg_speedups[
|
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
|
||||||
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
vllm_triton_time / vllm_cutlass_time
|
||||||
avg_speedups[
|
)
|
||||||
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
|
|
||||||
|
|
||||||
print("\n===== AVERAGE SPEEDUPS =====")
|
print("\n===== AVERAGE SPEEDUPS =====")
|
||||||
speedup_headers = ["Comparison", "Speedup"]
|
speedup_headers = ["Comparison", "Speedup"]
|
||||||
@@ -408,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
for impl in implementations:
|
for impl in implementations:
|
||||||
avg_diff[impl] += result["implementations"][impl]["diff"][
|
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
|
||||||
"Reference"]
|
|
||||||
|
|
||||||
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
||||||
diff_rows = []
|
diff_rows = []
|
||||||
|
|||||||
@@ -2,8 +2,8 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
from collections.abc import Iterable
|
from collections.abc import Callable, Iterable
|
||||||
from typing import Any, Callable, Optional
|
from typing import Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@@ -55,7 +55,7 @@ class Bench:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
cuda_graph_params: Optional[CudaGraphBenchParams],
|
cuda_graph_params: CudaGraphBenchParams | None,
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
description: str,
|
description: str,
|
||||||
|
|||||||
@@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
 ----------------------------------------------------------------------------------------------------
 ```

+### JSON configuration file for synthetic conversations generation
+
+The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
+When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations.
+
+The file `generate_multi_turn.json` is an example file.
+
+The file must contain the sections `prompt_input` and `prompt_output`.
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
+
+* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
+The final value will always be rounded to an even number so each user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
+### Random distributions for synthetic conversations generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
+The distribution determines how to randomly sample values for the field.
+
+The available distributions are listed below.
+
+**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.<br/>
+Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
+
+#### constant
+
+```json
+{
+    "distribution": "constant",
+    "value": 500
+}
+```
+
+* `value` - the fixed integer value (always returns the same number).
+
+#### uniform
+
+```json
+{
+    "distribution": "uniform",
+    "min": 12,
+    "max": 18
+}
+```
+
+* `min` - minimum value (inclusive).
+* `max` - maximum value (inclusive), should be equal or larger than min.
+
+#### lognormal
+
+```json
+{
+    "distribution": "lognormal",
+    "average": 1000,
+    "max": 5000
+}
+```
+
+You can parameterize the lognormal distribution in one of two ways:
+
+Using the average and optional median ratio:
+
+* `average` - target average value of the distribution.
+* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
+
+Using the parameters of the underlying normal distribution:
+
+* `mean` - mean of the underlying normal distribution.
+* `sigma` - standard deviation of the underlying normal distribution.
+
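To make the `average`/`median_ratio` parameterization above concrete, here is a small sketch (assumes only `numpy`; it mirrors the math the generator uses: `median = average * median_ratio`, `sigma = sqrt(2 * ln(average / median))`, `mu = ln(median)`):

```python
import numpy as np

average, median_ratio, cap = 1000, 0.85, 5000  # values from the JSON example above
median = average * median_ratio

# Parameters of the underlying normal distribution for np.random.lognormal
sigma = np.sqrt(2 * np.log(average / median))
mu = np.log(median)

samples = np.minimum(np.random.lognormal(mean=mu, sigma=sigma, size=100_000), cap)
print(round(samples.mean()), round(float(np.median(samples))))  # roughly 1000 and 850
```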
+#### zipf
+
+```json
+{
+    "distribution": "zipf",
+    "alpha": 1.2,
+    "max": 100
+}
+```
+
+* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
+
+#### poisson
+
+```json
+{
+    "distribution": "poisson",
+    "alpha": 10,
+    "max": 50
+}
+```
+
+* `alpha` - expected value (λ). Also the variance of the distribution.
+
 ## ShareGPT Conversations

 To run with the ShareGPT data, download the following ShareGPT dataset:
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple

 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
     def __init__(
         self,
-        min_val: Union[int, float],
-        max_val: Union[int, float],
+        min_val: int | float,
+        max_val: int | float,
         is_integer: bool = True,
     ) -> None:
         self.min_val = min_val
@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):


 class ConstantDistribution(Distribution):
-    def __init__(self, value: Union[int, float]) -> None:
+    def __init__(self, value: int | float) -> None:
         self.value = value
         self.max_val = value

@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):


 class ZipfDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val

@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):


 class PoissonDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val

@@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):

 class LognormalDistribution(Distribution):
     def __init__(
-        self, mean: float, sigma: float, max_val: Optional[int] = None
+        self,
+        mean: float | None = None,
+        sigma: float | None = None,
+        average: int | None = None,
+        median_ratio: float | None = None,
+        max_val: int | None = None,
     ) -> None:
+        self.average = average
+        self.median_ratio = median_ratio
+        self.max_val = max_val
+
+        if average is not None:
+            if average < 1:
+                raise ValueError("Lognormal average must be positive")
+
+            if mean or sigma:
+                raise ValueError(
+                    "When using lognormal average, you can't provide mean/sigma"
+                )
+
+            if self.median_ratio is None:
+                # Default value that provides relatively wide range of values
+                self.median_ratio = 0.85
+
+            # Calculate mean/sigma of np.random.lognormal based on the average
+            mean, sigma = self._generate_lognormal_by_median(
+                target_average=self.average, median_ratio=self.median_ratio
+            )
+        else:
+            if mean is None or sigma is None:
+                raise ValueError(
+                    "Must provide both mean and sigma if average is not used"
+                )
+
+            if mean <= 0 or sigma < 0:
+                raise ValueError(
+                    "Lognormal mean must be positive and sigma must be non-negative"
+                )
+
+        # Mean and standard deviation of the underlying normal distribution
+        # Based on numpy.random.lognormal
         self.mean = mean
         self.sigma = sigma
-        self.max_val = max_val
+
+    @staticmethod
+    def _generate_lognormal_by_median(
+        target_average: int, median_ratio: float
+    ) -> tuple[float, float]:
+        """
+        Compute (mu, sigma) for a lognormal distribution given:
+        - a target average (mean of the distribution)
+        - a ratio of median / mean (controls skewness), assume mean > median
+
+        Background:
+        If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
+        * mean(X) = exp(mu + sigma^2 / 2)
+        * median(X) = exp(mu)
+
+        So:
+            median / mean = exp(mu) / exp(mu + sigma^2 / 2)
+                          = exp(-sigma^2 / 2)
+
+        Rearranging:
+            sigma^2 = 2 * ln(mean / median)
+            mu = ln(median)
+
+        This gives a unique (mu, sigma) for any valid mean and median.
+        """
+        # Check input validity: median must be smaller than mean
+        if median_ratio <= 0 or median_ratio >= 1:
+            raise ValueError("median_ratio must be in range (0, 1)")
+
+        target_median = target_average * median_ratio
+
+        # Solve sigma^2 = 2 * ln(mean / median)
+        sigma = np.sqrt(2 * np.log(target_average / target_median))
+        mu = np.log(target_median)
+
+        return mu, sigma

     def sample(self, size: int = 1) -> np.ndarray:
         samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
+
+        if self.average is not None:
+            # Scale to average
+            samples *= self.average / samples.mean()
+
         if self.max_val:
             samples = np.minimum(samples, self.max_val)

         return np.round(samples).astype(int)

     def __repr__(self) -> str:
-        return f"LognormalDistribution[{self.mean}, {self.sigma}]"
+        if self.average:
+            return (
+                f"LognormalDistribution[{self.average}, "
+                f"{self.median_ratio}, {self.max_val}]"
+            )
+        return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"

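A quick numeric check of the `sample()` behavior added above (a standalone sketch under the same math; the rescaling and cap steps are copied from the hunk, the harness is illustrative):

```python
import numpy as np

# Reproduce LognormalDistribution.sample() for average=1000, median_ratio=0.85:
mu, sigma = np.log(850), np.sqrt(2 * np.log(1000 / 850))
samples = np.random.lognormal(mean=mu, sigma=sigma, size=10_000)
samples *= 1000 / samples.mean()     # exact-mean rescaling step from the diff
samples = np.minimum(samples, 5000)  # optional "max" cap (applied after rescaling)
print(samples.mean())  # ~1000; slightly below if the cap truncated the tail
```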
 class GenConvArgs(NamedTuple):
@@ -173,10 +257,21 @@ def get_random_distribution(
         return PoissonDistribution(conf["alpha"], max_val=max_val)

     elif distribution == "lognormal":
+        max_val = conf.get("max", None)
+
+        if "average" in conf:
+            # Infer lognormal mean/sigma (numpy) from input average
+            median_ratio = conf.get("median_ratio", None)
+            return LognormalDistribution(
+                average=conf["average"], median_ratio=median_ratio, max_val=max_val
+            )
+
+        # Use mean/sigma directly (for full control over the distribution)
         verify_field_exists(conf, "mean", section, subsection)
         verify_field_exists(conf, "sigma", section, subsection)
-        max_val = conf.get("max", None)
-        return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
+        return LognormalDistribution(
+            mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
+        )

     elif distribution == "uniform":
         verify_field_exists(conf, "min", section, subsection)
@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple

 import aiohttp  # type: ignore
 import numpy as np  # type: ignore
@@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):

 class ClientArgs(NamedTuple):
     seed: int
-    max_num_requests: Optional[int]
+    max_num_requests: int | None
     skip_first_turn: bool
-    max_turns: Optional[int]
+    max_turns: int | None
     max_active_conversations: int
     verbose: bool
     print_content: bool
@@ -109,9 +109,9 @@ class RequestStats(NamedTuple):

 class MetricStats:
     def __init__(self) -> None:
-        self.min: Optional[float] = None
-        self.max: Optional[float] = None
-        self.avg: Optional[float] = None
+        self.min: float | None = None
+        self.max: float | None = None
+        self.avg: float | None = None
         self.sum = 0.0
         self.count = 0

@@ -143,7 +143,7 @@ class MovingAverage:
         self.index = 0
         self.sum = 0.0
         self.count = 0
-        self.avg: Optional[float] = None
+        self.avg: float | None = None

     def update(self, new_value: float) -> None:
         if self.count < self.window_size:
@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
     def __init__(self, logger: logging.Logger, window_size: int) -> None:
         self.logger = logger
-        self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+        self.metrics: dict[str, MovingAverage | MetricStats] = {
             "moving_avg_ttft_ms": MovingAverage(window_size),
             "moving_avg_tpot_ms": MovingAverage(window_size),
             "ttft_ms": MetricStats(),
@@ -198,14 +198,6 @@ class DebugStats:
         self.logger.info("-" * 50)


-# Must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix) :]
-    return text
-
-
 def nanosec_to_millisec(value: float) -> float:
     return value / 1000000.0

@@ -220,8 +212,8 @@ async def send_request(
     chat_url: str,
     model: str,
     stream: bool = True,
-    min_tokens: Optional[int] = None,
-    max_tokens: Optional[int] = None,
+    min_tokens: int | None = None,
+    max_tokens: int | None = None,
 ) -> ServerResponse:
     payload = {
         "model": model,
@@ -250,9 +242,9 @@ async def send_request(
     timeout = aiohttp.ClientTimeout(total=timeout_sec)

     valid_response = True
-    ttft: Optional[float] = None
+    ttft: float | None = None
     chunk_delay: list[int] = []
-    latency: Optional[float] = None
+    latency: float | None = None
     first_chunk = ""
     generated_text = ""

@@ -269,7 +261,7 @@ async def send_request(
             if not chunk_bytes:
                 continue

-            chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
             if chunk == "[DONE]":
                 # End of stream
                 latency = time.perf_counter_ns() - start_time
@@ -364,7 +356,7 @@ async def send_turn(
|
|||||||
req_args: RequestArgs,
|
req_args: RequestArgs,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
verify_output: bool,
|
verify_output: bool,
|
||||||
) -> Optional[RequestStats]:
|
) -> RequestStats | None:
|
||||||
assert messages_to_use > 0
|
assert messages_to_use > 0
|
||||||
assert messages_to_use <= len(conversation_messages)
|
assert messages_to_use <= len(conversation_messages)
|
||||||
|
|
||||||
@@ -644,7 +636,7 @@ async def client_main(
|
|||||||
|
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
curr_time_sec: float = time.perf_counter()
|
curr_time_sec: float = time.perf_counter()
|
||||||
time_since_last_turn: Union[str, float] = "N/A"
|
time_since_last_turn: str | float = "N/A"
|
||||||
if conv_id in time_of_last_turn:
|
if conv_id in time_of_last_turn:
|
||||||
time_since_last_turn = round(
|
time_since_last_turn = round(
|
||||||
curr_time_sec - time_of_last_turn[conv_id], 3
|
curr_time_sec - time_of_last_turn[conv_id], 3
|
||||||
@@ -769,7 +761,7 @@ def get_client_config(
|
|||||||
"Number of conversations must be equal or larger than the number of clients"
|
"Number of conversations must be equal or larger than the number of clients"
|
||||||
)
|
)
|
||||||
|
|
||||||
max_req_per_client: Optional[int] = None
|
max_req_per_client: int | None = None
|
||||||
if args.max_num_requests is not None:
|
if args.max_num_requests is not None:
|
||||||
# Max number of requests per client
|
# Max number of requests per client
|
||||||
req_per_client = args.max_num_requests // args.num_clients
|
req_per_client = args.max_num_requests // args.num_clients
|
||||||
@@ -936,13 +928,13 @@ async def main_mp(
|
|||||||
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
|
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
|
||||||
)
|
)
|
||||||
|
|
||||||
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
|
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
|
||||||
if len(client_metrics) < (5 * bench_args.num_clients):
|
if len(client_metrics) < (5 * bench_args.num_clients):
|
||||||
# Do not estimate the RPS if the number of samples is very low
|
# Do not estimate the RPS if the number of samples is very low
|
||||||
# (threshold can be tuned if needed)
|
# (threshold can be tuned if needed)
|
||||||
rps = "N/A"
|
rps = "N/A"
|
||||||
|
|
||||||
runtime_left_sec: Union[str, float] = round(
|
runtime_left_sec: str | float = round(
|
||||||
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
|
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
|
||||||
)
|
)
|
||||||
if percent < 0.05:
|
if percent < 0.05:
|
||||||
@@ -1032,7 +1024,7 @@ def process_statistics(
|
|||||||
warmup_percentages: list[float],
|
warmup_percentages: list[float],
|
||||||
test_params: dict,
|
test_params: dict,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
gen_conv_args: Optional[GenConvArgs] = None,
|
gen_conv_args: GenConvArgs | None = None,
|
||||||
excel_output: bool = False,
|
excel_output: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
if len(client_metrics) == 0:
|
if len(client_metrics) == 0:
|
||||||
@@ -1259,7 +1251,7 @@ async def main() -> None:
|
|||||||
default=None,
|
default=None,
|
||||||
help="The model name used in the API. "
|
help="The model name used in the API. "
|
||||||
"If not specified, the model name will be the "
|
"If not specified, the model name will be the "
|
||||||
"same as the ``--model`` argument. ",
|
"same as the `--model` argument. ",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
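The `str | float` annotations above encode a sentinel pattern: the value is a rounded float when enough samples exist and the literal "N/A" otherwise. A hedged illustration of the same pattern (function name and fixed threshold are made up for the example):

    def estimate_rps(num_samples: int, runtime_sec: float) -> str | float:
        # Mirror the guard above: skip the estimate when samples are scarce.
        if runtime_sec <= 0 or num_samples < 5:
            return "N/A"
        return round(num_samples / runtime_sec, 3)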
@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any

 import pandas as pd  # type: ignore
 import tqdm  # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:


 def content_is_valid(
-    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+    content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
     if min_content_len and len(content) < min_content_len:
         return False
@@ -37,7 +37,7 @@ def content_is_valid(


 def print_stats(
-    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
     # Collect statistics
     stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
     seed: int,
     input_file: str,
     output_file: str,
-    max_items: Optional[int],
-    min_content_len: Optional[int] = None,
-    max_content_len: Optional[int] = None,
-    min_turns: Optional[int] = None,
-    max_turns: Optional[int] = None,
-    model: Optional[str] = None,
+    max_items: int | None,
+    min_content_len: int | None = None,
+    max_content_len: int | None = None,
+    min_turns: int | None = None,
+    max_turns: int | None = None,
+    model: str | None = None,
 ) -> None:
     if min_turns and max_turns:
         assert min_turns <= max_turns
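`content_is_valid` treats `None` (or 0) as a disabled bound; only the minimum-length branch is visible in the hunk, so the symmetric maximum-length check in this sketch is inferred context, not diff content:

    def content_is_valid_sketch(
        content: str, min_content_len: int | None, max_content_len: int | None
    ) -> bool:
        # A falsy bound (None or 0) disables the check, matching the
        # truthiness tests used in the diff.
        if min_content_len and len(content) < min_content_len:
            return False
        if max_content_len and len(content) > max_content_len:
            return False
        return True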
@@ -15,9 +15,8 @@
     },
     "prefix_num_tokens": {
         "distribution": "lognormal",
-        "mean": 6,
-        "sigma": 4,
-        "max": 1500
+        "average": 1000,
+        "max": 5000
     },
     "num_tokens": {
         "distribution": "uniform",
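The `prefix_num_tokens` spec moves from raw lognormal parameters (`mean`, `sigma`) to a more intuitive `average`/`max` form. One plausible way a generator could honor it is sketched below; the mapping from `average` to the lognormal's mu (and the fixed sigma) is an assumption, not taken from the generator code:

    import math
    import random

    def sample_prefix_num_tokens(average: float = 1000.0, max_tokens: int = 5000,
                                 sigma: float = 1.0) -> int:
        # Choose mu so the lognormal mean exp(mu + sigma^2 / 2) equals
        # `average`, then cap the draw at `max_tokens`.
        mu = math.log(average) - sigma**2 / 2
        return min(int(random.lognormvariate(mu, sigma)), max_tokens)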
@@ -1,49 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
@@ -101,6 +101,7 @@ else()
     find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
     find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
     find_isa(${CPUINFO} "S390" S390_FOUND)
+    find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
 endif()

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -177,35 +178,74 @@ elseif (S390_FOUND)
         "-mzvector"
         "-march=native"
         "-mtune=native")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+    if(RVV_FOUND)
+        message(FATAL_ERROR "Can't support rvv now.")
+    else()
+        list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
+    endif()
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()

-#
-# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
-# Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
-    set(USE_ACL ON)
-else()
-    set(USE_ACL OFF)
-endif()
-
+# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-    FetchContent_Declare(
-        oneDNN
-        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
-        GIT_PROGRESS TRUE
-        GIT_SHALLOW TRUE
-    )
-
-    if(USE_ACL)
-        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
-        if(NOT ARM_COMPUTE_LIBRARY)
-            message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
+    # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
+    # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
+    if(ASIMD_FOUND)
+        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
+            message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
+        else()
+            message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
+            FetchContent_Populate(arm_compute
+                SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
+                SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
+                GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
+                GIT_TAG v52.2.0
+                GIT_SHALLOW TRUE
+                GIT_PROGRESS TRUE
+            )
+            set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
         endif()
+
+        # Build ACL with scons
+        include(ProcessorCount)
+        ProcessorCount(_NPROC)
+        execute_process(
+            COMMAND scons -j${_NPROC}
+                    Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+                    arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+                    multi_isa=1 openmp=1 cppthreads=0
+            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+            RESULT_VARIABLE _acl_rc
+        )
+        if(NOT _acl_rc EQUAL 0)
+            message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
+        endif()
+
         set(ONEDNN_AARCH64_USE_ACL "ON")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+        add_compile_definitions(VLLM_USE_ACL)
+    endif()
+
+    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+        message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+        FetchContent_Declare(
+            oneDNN
+            SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+        )
+    else()
+        message(STATUS "Downloading oneDNN from GitHub")
+        FetchContent_Declare(
+            oneDNN
+            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+            GIT_TAG v3.9
+            GIT_PROGRESS TRUE
+            GIT_SHALLOW TRUE
+        )
     endif()

     set(ONEDNN_LIBRARY_TYPE "STATIC")
@@ -258,7 +298,8 @@ set(VLLM_EXT_SRC
     "csrc/cpu/layernorm.cpp"
     "csrc/cpu/mla_decode.cpp"
     "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp")
+    "csrc/cpu/torch_bindings.cpp"
+    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     set(VLLM_EXT_SRC
Some files were not shown because too many files have changed in this diff.