Compare commits
1202 Commits
36  .buildkite/check-wheel-size.py  Normal file
@@ -0,0 +1,36 @@
import os
import zipfile

MAX_SIZE_MB = 250


def print_top_10_largest_files(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


def check_wheel_size(directory):
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                wheel_path = os.path.join(root, f)
                wheel_size = os.path.getsize(wheel_path)
                wheel_size_mb = wheel_size / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(
                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                        f"compared to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb} MB).")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(check_wheel_size(sys.argv[1]))
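This check is presumably run by CI against the directory holding the built wheels; a minimal usage sketch (the `dist/` path is illustrative, not taken from this diff):

```python
# Hypothetical local invocation of the wheel-size check above; assumes the
# built .whl files live under dist/. The script reports oversize wheels and
# signals failure through its exit code.
import subprocess

result = subprocess.run(["python", ".buildkite/check-wheel-size.py", "dist"])
print("wheel size check passed" if result.returncode == 0 else "wheel too large")
```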
@@ -1,18 +0,0 @@
#!/bin/bash

set -ex
set -o pipefail

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

cd -
@@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.671
  - name: "exact_match,flexible-extract"
    value: 0.664
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.905
  - name: "exact_match,flexible-extract"
    value: 0.905
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.892
  - name: "exact_match,flexible-extract"
    value: 0.892
limit: 250
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.752
  - name: "exact_match,flexible-extract"
    value: 0.754
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.755
  - name: "exact_match,flexible-extract"
    value: 0.755
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.728
  - name: "exact_match,flexible-extract"
    value: 0.728
limit: 250
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.758
  - name: "exact_match,flexible-extract"
    value: 0.759
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
limit: 250
num_fewshot: 5
11  .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml  Normal file
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.409
  - name: "exact_match,flexible-extract"
    value: 0.406
limit: 1000
num_fewshot: 5
11  .buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml  Normal file
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
model_name: "nvidia/Minitron-4B-Base"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.252
  - name: "exact_match,flexible-extract"
    value: 0.252
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.86
  - name: "exact_match,flexible-extract"
    value: 0.86
limit: 250
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.624
  - name: "exact_match,flexible-extract"
    value: 0.624
limit: 250
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.632
limit: 250
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.578
  - name: "exact_match,flexible-extract"
    value: 0.585
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.593
  - name: "exact_match,flexible-extract"
    value: 0.588
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.595
  - name: "exact_match,flexible-extract"
    value: 0.582
limit: 1000
num_fewshot: 5

@@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.792
  - name: "exact_match,flexible-extract"
    value: 0.824
limit: 250
num_fewshot: 5
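All of these configs share the same small schema (model name, tasks with expected metric values, sample limit, fewshot count). As a hedged sketch of reading one back — mirroring what `test_lm_eval_correctness.py` later in this diff does — assuming the Minitron config path from above:

```python
from pathlib import Path

import yaml

# Illustrative path; any of the configs above has the same schema.
config = yaml.safe_load(
    Path(".buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml")
    .read_text(encoding="utf-8"))

print(config["model_name"])
for task in config["tasks"]:
    for metric in task["metrics"]:
        print(task["name"], metric["name"], metric["value"])
```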
5  .buildkite/lm-eval-harness/configs/models-large.txt  Normal file
@@ -0,0 +1,5 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
10  .buildkite/lm-eval-harness/configs/models-small.txt  Normal file
@@ -0,0 +1,10 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
46  .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh  Normal file
@@ -0,0 +1,46 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo
}

while getopts "m:b:l:f:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model hf \
  --model_args pretrained=$MODEL,parallelize=True \
  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
  --batch_size $BATCH_SIZE
51  .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh  Normal file
@@ -0,0 +1,51 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.3

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
  --batch_size $BATCH_SIZE
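The `--model_args` value assembled above is just a comma-separated key=value list. A minimal Python sketch of the equivalent string construction (the model name and TP size are illustrative stand-ins for the script's `-m`/`-t` arguments):

```python
# Illustrative stand-ins for the script's -m and -t arguments.
model = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tp_size = 1

# Mirrors the comma-separated key=value format passed to lm_eval above.
model_args = ",".join([
    f"pretrained={model}",
    f"tensor_parallel_size={tp_size}",
    'distributed_executor_backend="ray"',
    "trust_remote_code=true",
    "max_model_len=4096",
])
print(model_args)
```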
59  .buildkite/lm-eval-harness/run-tests.sh  Normal file
@@ -0,0 +1,59 @@
#!/bin/bash

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -c    - path to the test data config (e.g. configs/models-small.txt)"
    echo "  -t    - tensor parallel size"
    echo
}

SUCCESS=0

while getopts "c:t:" OPT; do
  case ${OPT} in
    c )
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
55  .buildkite/lm-eval-harness/test_lm_eval_correctness.py  Normal file
@@ -0,0 +1,55 @@
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""

import os
from pathlib import Path

import lm_eval
import numpy
import yaml

RTOL = 0.02
TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def launch_lm_eval(eval_config):
    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto")

    return results


def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
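For intuition on the `RTOL = 0.02` check above: `numpy.isclose` accepts a measured score within roughly 2% (relative) of the stored baseline. A minimal sketch with illustrative scores:

```python
import numpy

RTOL = 0.02  # same relative tolerance as the test above

ground_truth = 0.756  # illustrative baseline score, not from a real run
print(numpy.isclose(ground_truth, 0.762, rtol=RTOL))  # True: within ~2%
print(numpy.isclose(ground_truth, 0.700, rtol=RTOL))  # False: a real regression
```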
152  .buildkite/nightly-benchmarks/README.md  Normal file
@@ -0,0 +1,152 @@
# vLLM benchmark suite

## Introduction

This directory contains two sets of benchmarks for vllm:
- Performance benchmark: benchmarks vllm's performance under various workloads, so that **developers** can see whether their PR improves or degrades vllm's performance.
- Nightly benchmark: compares vllm's performance against alternatives (tgi, trt-llm and lmdeploy), so that **the public** knows when to choose vllm.

See the [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and the [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results.

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for FP8 benchmarks on H100 is coming!), with different models.

**Benchmarking Duration**: about 1hr.

**For benchmarking developers**: please try your best to constrain the benchmarking duration to about 1 hr so that it won't take forever to run.

## Nightly benchmark quick overview

**Benchmarking Coverage**: fixed-QPS serving on A100 (support for FP8 benchmarks on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.

**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.

**Benchmarking Duration**: about 3.5hrs.

## Trigger the benchmark

The performance benchmark is triggered when:
- A PR is merged into vllm.
- A commit is pushed to a PR that carries the `perf-benchmarks` label.

The nightly benchmark is triggered when:
- A commit is pushed to a PR that carries the `nightly-benchmarks` label.

## Performance benchmark details

See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

#### Latency test

Here is an example of one test inside `latency-tests.json`:

```json
[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
]
```

In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command line arguments used for `benchmark_latency.py`. Note that you should use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` converts the underscores to dashes before feeding the arguments to `benchmark_latency.py`. For this example, the corresponding command line arguments will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` (see the sketch after this subsection).

Note that the performance numbers are highly sensitive to the values of these parameters. Please make sure they are set correctly.

WARNING: The benchmarking script saves the json results by itself, so please do not configure the `--output-json` parameter in the json file.
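To make the underscore-to-dash conversion concrete, here is a hedged Python sketch (the actual conversion lives in `run-benchmarks-suite.sh`; this is a standalone approximation, using the parameters from the latency example above):

```python
# Parameters from the latency-tests.json example above.
parameters = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}

# Underscores in keys become dashes in command line flags.
args = []
for key, value in parameters.items():
    args += [f"--{key.replace('_', '-')}", str(value)]
print(" ".join(args))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy ...
```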
#### Throughput test

The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed to `benchmark_throughput.py`.

The numbers from this test are also stable; note, though, that a slight change in the parameter values can shift the performance numbers by a lot.

#### Serving test

We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`; here is an example:

```json
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
]
```

Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server_parameters` attribute includes the command line arguments for the vLLM server.
- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` attribute controls the list of QPS values to test. It is used to configure the `--request-rate` parameter in `benchmark_serving.py`.

The numbers from this test are less stable than the latency and throughput benchmarks (due to the randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still indicates a real difference in performance.

WARNING: The benchmarking script saves the json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

#### Visualizing the results

The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results.
You can find the result presented as a table on the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark) is also attached to the markdown file.
The raw benchmarking results (as json files) are in the `Artifacts` tab of the benchmarking job.

## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workload, models and docker containers used for benchmarking the other llm engines.

#### Workflow

- [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for the different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which probes the serving engine of the current container.
- `run-nightly-suite.sh` redirects the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to buildkite.

#### Nightly tests

In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to the performance benchmark.

#### Docker containers

The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.

WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded because there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.

WARNING: updating `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
61
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
steps:
|
||||||
|
- label: "Wait for container to be ready"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
containers:
|
||||||
|
- image: badouralix/curl-jq
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||||
|
- wait
|
||||||
|
- label: "A100"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
|
containers:
|
||||||
|
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
# - label: "H100"
|
||||||
|
# agents:
|
||||||
|
# queue: H100
|
||||||
|
# plugins:
|
||||||
|
# - docker#v5.11.0:
|
||||||
|
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
# command:
|
||||||
|
# - bash
|
||||||
|
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
|
||||||
|
# mount-buildkite-agent: true
|
||||||
|
# propagate-environment: true
|
||||||
|
# ipc: host
|
||||||
|
# gpus: all
|
||||||
|
# environment:
|
||||||
|
# - VLLM_USAGE_SOURCE
|
||||||
|
# - HF_TOKEN
|
||||||
|
|
||||||
45
.buildkite/nightly-benchmarks/nightly-descriptions.md
Normal file
45
.buildkite/nightly-benchmarks/nightly-descriptions.md
Normal file
@@ -0,0 +1,45 @@
# Nightly benchmark

The main goal of this benchmarking is two-fold:

- Performance clarity: provide clarity on which engine (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance for which workload.
- Reproducibility: anyone can run the exact same set of benchmarking commands inside the exact same Docker image by following the reproduction instructions in [reproduce.md]().

## Docker images

We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following Docker images:

- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1

<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->

## Hardware

One AWS node with 8x NVIDIA A100 GPUs.

## Workload description

We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:

- Input length: randomly sample 500 prompts from the ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is determined by a Poisson process (with fixed random seed); see the sketch after this list.
- Evaluation metrics: throughput (higher is better), TTFT (time to the first token, lower is better), ITL (inter-token latency, lower is better).
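For intuition on the arrival pattern: in a Poisson process with rate `qps`, the gaps between consecutive arrivals are i.i.d. exponential with mean 1/qps. A standalone, purely illustrative sketch (the actual generation happens inside vLLM's `benchmark_serving.py`):

import numpy as np

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0):
    # Inter-arrival gaps of a Poisson process with rate qps are i.i.d.
    # Exponential with mean 1/qps; arrival times are their cumulative sum.
    rng = np.random.default_rng(seed)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)

print(poisson_arrival_times(5, qps=4.0))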
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->

## Plots

In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. A value of 0 means that the corresponding benchmark crashed.

<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >

## Results

{nightly_results_benchmarking_table}
.buildkite/nightly-benchmarks/nightly-pipeline.yaml (new file, 120 lines)
@@ -0,0 +1,120 @@
common_pod_spec: &common_pod_spec
  priorityClassName: perf-benchmark
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  volumes:
    - name: devshm
      emptyDir:
        medium: Memory
    - name: hf-cache
      hostPath:
        path: /root/.cache/huggingface
        type: Directory

common_container_settings: &common_container_settings
  command:
    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
  resources:
    limits:
      nvidia.com/gpu: 8
  volumeMounts:
    - name: devshm
      mountPath: /dev/shm
    - name: hf-cache
      mountPath: /root/.cache/huggingface
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token

steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
  - label: "A100 trt benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: openmmlab/lmdeploy:v0.5.0
                <<: *common_container_settings

  - label: "A100 vllm benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:latest
                <<: *common_container_settings

  - label: "A100 tgi benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: ghcr.io/huggingface/text-generation-inference:2.1
                <<: *common_container_settings

  - wait

  - label: "Plot"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.5.0.post1
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token

  - wait
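The `&common_pod_spec` / `<<: *common_pod_spec` pattern above uses standard YAML anchors and merge keys, so every engine's step shares one pod spec and one container configuration; only the container image differs between the benchmark steps.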
.buildkite/nightly-benchmarks/run-benchmarks-suite.sh (new file, 380 lines)
@@ -0,0 +1,380 @@
#!/bin/bash

# This script should be run inside the CI process.
# This script assumes that we are already inside the vllm/ directory.
# Benchmarking results will be available inside vllm/benchmarks/results/.

# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see the other benchmarking results even when mixtral crashes.
set -o pipefail

check_gpus() {
    # check the number of GPUs and the GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
    echo "GPU type is $gpu_type"
}

check_hf_token() {
    # check if HF_TOKEN is available and valid
    if [[ -z "$HF_TOKEN" ]]; then
        echo "Error: HF_TOKEN is not set."
        exit 1
    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
        echo "Error: HF_TOKEN does not start with 'hf_'."
        exit 1
    else
        echo "HF_TOKEN is set and valid."
    fi
}

ensure_sharegpt_downloaded() {
    local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
    if [ ! -f "$FILE" ]; then
        wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
    else
        echo "$FILE already exists."
    fi
}

json2args() {
    # transforms the JSON string to command line args, where '_' is replaced with '-'
    # example:
    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    local json_string=$1
    local args=$(
        echo "$json_string" | jq -r '
            to_entries |
            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
            join(" ")
        '
    )
    echo "$args"
}
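For reference, the same transformation that `json2args` performs, sketched in Python (illustrative only; the suite itself uses the `jq` version above):

import json

def json2args(json_string: str) -> str:
    # {"tensor_parallel_size": 1} -> "--tensor-parallel-size 1"
    params = json.loads(json_string)
    return " ".join(f"--{key.replace('_', '-')} {value}"
                    for key, value in params.items())

print(json2args('{"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}'))
# --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1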
wait_for_server() {
    # wait for the vllm server to start
    # return 1 if the vllm server crashes
    timeout 1200 bash -c '
        until curl -X POST localhost:8000/v1/completions; do
            sleep 1
        done' && return 0 || return 1
}

kill_gpu_processes() {
    # kill all processes on the GPU.
    pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
    if [ -z "$pids" ]; then
        echo "No GPU processes found."
    else
        for pid in $pids; do
            kill -9 "$pid"
            echo "Killed process with PID: $pid"
        done

        echo "All GPU processes have been killed."
    fi

    # wait for the GPU processes to be fully killed
    # loop while nvidia-smi returns any processes
    while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
        sleep 1
        echo "Waiting for GPU processes to be killed"
    done

    # remove the vllm config file
    rm -rf ~/.config/vllm

    # Print the GPU memory usage
    # so that we know whether all GPU processes have been killed.
    gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
    # The memory usage should be 0 MB.
    echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

upload_to_buildkite() {
    # upload the benchmarking results to buildkite

    # if the agent binary is not found, skip uploading the results and return 0
    # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
    if command -v buildkite-agent >/dev/null 2>&1; then
        BUILDKITE_AGENT_COMMAND="buildkite-agent"
    elif [ -f /workspace/buildkite-agent ]; then
        BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
    else
        echo "buildkite-agent binary not found. Skip uploading the results."
        return 0
    fi

    # Use the determined command to annotate and upload artifacts
    $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
    $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

run_latency_tests() {
    # run latency tests using `benchmark_latency.py`
    # $1: a json file specifying latency test cases

    local latency_test_file
    latency_test_file=$1

    # Iterate over latency tests
    jq -c '.[]' "$latency_test_file" | while read -r params; do
        # get the test name, and append the GPU type back to it.
        test_name=$(echo "$params" | jq -r '.test_name')
        if [[ ! "$test_name" =~ ^latency_ ]]; then
            echo "In latency-test.json, test_name must start with \"latency_\"."
            exit 1
        fi

        # if TEST_SELECTOR is set, only run the test cases that match the selector
        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
            echo "Skip test case $test_name."
            continue
        fi

        # get arguments
        latency_params=$(echo "$params" | jq -r '.parameters')
        latency_args=$(json2args "$latency_params")

        # check if there are enough GPUs to run the test
        tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
        if [[ $gpu_count -lt $tp ]]; then
            echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
            continue
        fi

        latency_command="python3 benchmark_latency.py \
            --output-json $RESULTS_FOLDER/${test_name}.json \
            $latency_args"

        echo "Running test case $test_name"
        echo "Latency command: $latency_command"

        # record the benchmarking command and the GPU type
        jq_output=$(jq -n \
            --arg latency "$latency_command" \
            --arg gpu "$gpu_type" \
            '{
                latency_command: $latency,
                gpu_type: $gpu
            }')
        echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

        # run the benchmark
        eval "$latency_command"

        kill_gpu_processes

    done
}


run_throughput_tests() {
    # run throughput tests using `benchmark_throughput.py`
    # $1: a json file specifying throughput test cases

    local throughput_test_file
    throughput_test_file=$1

    # Iterate over throughput tests
    jq -c '.[]' "$throughput_test_file" | while read -r params; do
        # get the test name, and append the GPU type back to it.
        test_name=$(echo "$params" | jq -r '.test_name')
        if [[ ! "$test_name" =~ ^throughput_ ]]; then
            echo "In throughput-test.json, test_name must start with \"throughput_\"."
            exit 1
        fi

        # if TEST_SELECTOR is set, only run the test cases that match the selector
        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
            echo "Skip test case $test_name."
            continue
        fi

        # get arguments
        throughput_params=$(echo "$params" | jq -r '.parameters')
        throughput_args=$(json2args "$throughput_params")

        # check if there are enough GPUs to run the test
        tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
        if [[ $gpu_count -lt $tp ]]; then
            echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
            continue
        fi

        throughput_command="python3 benchmark_throughput.py \
            --output-json $RESULTS_FOLDER/${test_name}.json \
            $throughput_args"

        echo "Running test case $test_name"
        echo "Throughput command: $throughput_command"
        # record the benchmarking command and the GPU type
        jq_output=$(jq -n \
            --arg command "$throughput_command" \
            --arg gpu "$gpu_type" \
            '{
                throughput_command: $command,
                gpu_type: $gpu
            }')
        echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

        # run the benchmark
        eval "$throughput_command"

        kill_gpu_processes

    done
}

run_serving_tests() {
    # run serving tests using `benchmark_serving.py`
    # $1: a json file specifying serving test cases

    local serving_test_file
    serving_test_file=$1

    # Iterate over serving tests
    jq -c '.[]' "$serving_test_file" | while read -r params; do
        # get the test name, and append the GPU type back to it.
        test_name=$(echo "$params" | jq -r '.test_name')
        if [[ ! "$test_name" =~ ^serving_ ]]; then
            echo "In serving-test.json, test_name must start with \"serving_\"."
            exit 1
        fi

        # if TEST_SELECTOR is set, only run the test cases that match the selector
        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
            echo "Skip test case $test_name."
            continue
        fi

        # get client and server arguments
        server_params=$(echo "$params" | jq -r '.server_parameters')
        client_params=$(echo "$params" | jq -r '.client_parameters')
        server_args=$(json2args "$server_params")
        client_args=$(json2args "$client_params")
        qps_list=$(echo "$params" | jq -r '.qps_list')
        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
        echo "Running over qps list $qps_list"

        # check if there are enough GPUs to run the test
        tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
        if [[ $gpu_count -lt $tp ]]; then
            echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
            continue
        fi

        # check if the server model and the client model are aligned
        server_model=$(echo "$server_params" | jq -r '.model')
        client_model=$(echo "$client_params" | jq -r '.model')
        if [[ $server_model != "$client_model" ]]; then
            echo "Server model and client model must be the same. Skip testcase $test_name."
            continue
        fi

        server_command="python3 \
            -m vllm.entrypoints.openai.api_server \
            $server_args"

        # run the server
        echo "Running test case $test_name"
        echo "Server command: $server_command"
        eval "$server_command" &
        server_pid=$!

        # wait until the server is alive
        wait_for_server
        if [ $? -eq 0 ]; then
            echo ""
            echo "vllm server is up and running."
        else
            echo ""
            echo "vllm failed to start within the timeout period."
        fi

        # iterate over different QPS
        for qps in $qps_list; do
            # remove the surrounding single quote from qps
            if [[ "$qps" == *"inf"* ]]; then
                echo "qps was $qps"
                qps="inf"
                echo "now qps is $qps"
            fi

            new_test_name=$test_name"_qps_"$qps

            client_command="python3 benchmark_serving.py \
                --save-result \
                --result-dir $RESULTS_FOLDER \
                --result-filename ${new_test_name}.json \
                --request-rate $qps \
                $client_args"

            echo "Running test case $test_name with qps $qps"
            echo "Client command: $client_command"

            eval "$client_command"

            # record the benchmarking commands
            jq_output=$(jq -n \
                --arg server "$server_command" \
                --arg client "$client_command" \
                --arg gpu "$gpu_type" \
                '{
                    server_command: $server,
                    client_command: $client,
                    gpu_type: $gpu
                }')
            echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

        done

        # clean up
        kill -9 $server_pid
        kill_gpu_processes
    done
}

main() {
    check_gpus
    check_hf_token

    # dependencies
    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)

    # get the current IP address, required by benchmark_serving.py
    export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
    # turn off the reporting of the status of each request, to clean up the terminal output
    export VLLM_LOG_LEVEL="WARNING"

    # prepare for benchmarking
    cd benchmarks || exit 1
    ensure_sharegpt_downloaded
    declare -g RESULTS_FOLDER=results/
    mkdir -p $RESULTS_FOLDER
    QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

    # benchmarking
    run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
    run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
    run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

    # postprocess benchmarking results
    pip install tabulate pandas
    python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

    upload_to_buildkite
}

main "$@"
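Note that `TEST_SELECTOR` is matched as a bash regular expression against each `test_name` (via `[[ "$test_name" =~ $TEST_SELECTOR ]]`), so setting, for example, `TEST_SELECTOR=serving_` before invoking the suite runs only the serving test cases.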
.buildkite/nightly-benchmarks/run-nightly-suite.sh (new file, 76 lines)
@@ -0,0 +1,76 @@
#!/bin/bash

set -o pipefail
set -x

check_gpus() {
    # check the number of GPUs and the GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
    echo "GPU type is $gpu_type"
}

check_hf_token() {
    # check if HF_TOKEN is available and valid
    if [[ -z "$HF_TOKEN" ]]; then
        echo "Error: HF_TOKEN is not set."
        exit 1
    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
        echo "Error: HF_TOKEN does not start with 'hf_'."
        exit 1
    else
        echo "HF_TOKEN is set and valid."
    fi
}

main() {

    check_gpus
    check_hf_token

    df -h

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)

    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

    # run lmdeploy
    if which lmdeploy >/dev/null; then
        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
        exit 0
    fi

    # run tgi
    if [ -e /tgi-entrypoint.sh ]; then
        echo "tgi is available, redirect to run-tgi-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
        exit 0
    fi

    # run trt
    if which trtllm-build >/dev/null; then
        echo "trtllm is available, redirect to run-trt-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
        exit 0
    fi

    # run vllm
    if [ -e /vllm-workspace ]; then
        echo "vllm is available, redirect to run-vllm-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
        exit 0
    fi

}

main "$@"
.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py (new file, 192 lines)
@@ -0,0 +1,192 @@
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")

# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Mean latency (ms)",
    # "P10": "P10 (s)",
    # "P25": "P25 (s)",
    "P50": "Median latency (ms)",
    # "P75": "P75 (s)",
    # "P90": "P90 (s)",
    "P99": "P99 latency (ms)",
}

# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "num_requests": "# of req.",
    # "total_num_tokens": "Total # of tokens",
    # "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    # "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    # "input_throughput": "Input Tput (tok/s)",
    # "output_throughput": "Output Tput (tok/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    # "mean_tpot_ms": "Mean TPOT (ms)",
    # "median_tpot_ms": "Median",
    # "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
}


def read_markdown(file):
    if os.path.exists(file):
        with open(file, "r") as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"


def results_to_json(latency, throughput, serving):
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })


if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to the collected serving results
            serving_results.append(raw_result)
            continue

        elif "latency" in str(test_file):
            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply by 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to the collected latency results
            latency_results.append(raw_result)
            continue

        elif "throughput" in str(test_file):
            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to the collected throughput results
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remap the keys, for visualization purposes
    if not latency_results.empty:
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:

        results = read_markdown(
            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:

        results = latency_results.to_dict(
            orient='records') + throughput_results.to_dict(
                orient='records') + serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
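As a reminder of what `tablefmt='pipe'` produces, a minimal standalone example with illustrative data:

import pandas as pd
from tabulate import tabulate

df = pd.DataFrame([{"Test name": "serving_llama8B_tp1_qps_4", "Tput (req/s)": 3.9}])
# tablefmt='pipe' emits a GitHub-flavored markdown table,
# which is what ends up in benchmark_results.md above.
print(tabulate(df, headers="keys", tablefmt="pipe", showindex=False))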
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py (new file, 26 lines)
@@ -0,0 +1,26 @@
import argparse

from transformers import AutoTokenizer


def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")

    args = parser.parse_args()
    main(args.model, args.cachedir)
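As used by the lmdeploy runner below, the invocation looks like `python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py --model "$model" --cachedir /tokenizer_cache`, which caches the tokenizer locally so the benchmarking client does not need to fetch it from the Hugging Face Hub on every run.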
.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py (new file, 6 lines)
@@ -0,0 +1,6 @@
from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]

print(model_name)
.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh (new file, 102 lines)
@@ -0,0 +1,102 @@
#!/bin/bash

server_params=$1
common_params=$2

model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')

cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine

cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)

# make sure the parameters inside tensorrt-demo are consistent with the environment variables
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt

cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/

cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}

if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then

    echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
    echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
    python ../quantization/quantize.py \
        --model_dir ${model_path} \
        --dtype ${model_dtype} \
        --tp_size ${model_tp_size} \
        --output_dir ${trt_model_path} \
        --qformat fp8 \
        --kv_cache_dtype fp8 \
        --calib_size 2

else

    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
    python3 convert_checkpoint.py \
        --model_dir ${model_path} \
        --dtype ${model_dtype} \
        --tp_size ${model_tp_size} \
        --output_dir ${trt_model_path}

fi

trtllm-build \
    --checkpoint_dir=${trt_model_path} \
    --gpt_attention_plugin=${model_dtype} \
    --gemm_plugin=${model_dtype} \
    --remove_input_padding=enable \
    --paged_kv_cache=enable \
    --tp_size=${model_tp_size} \
    --max_batch_size=${max_batch_size} \
    --max_input_len=${max_input_len} \
    --max_output_len=${max_output_len} \
    --max_num_tokens=${max_output_len} \
    --opt_num_tokens=${max_output_len} \
    --output_dir=${trt_engine_path}

cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
    --world_size=${model_tp_size} \
    --model_repo=/tensorrtllm_backend/triton_model_repo &
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh (new file, 40 lines)
@@ -0,0 +1,40 @@
#!/bin/bash

set -ex
set -o pipefail

main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
        exit 0
    fi

    # initial annotation
    description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

    # download results
    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/

    # generate figures
    python3 -m pip install tabulate pandas matplotlib
    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
        --description $description \
        --results-folder results/

    # upload results and figures
    /workspace/buildkite-agent artifact upload "nightly_results.png"
    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}

main "$@"
.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py (new file, 135 lines)
@@ -0,0 +1,135 @@
import argparse
import json
import math
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate


def parse_arguments():
    parser = argparse.ArgumentParser(
        description=
        'Parse command line arguments for summary-nightly-results script.')
    parser.add_argument('--results-folder',
                        type=str,
                        required=True,
                        help='The folder where the results are stored.')
    parser.add_argument('--description',
                        type=str,
                        required=True,
                        help='Description of the results.')

    args = parser.parse_args()
    return args


def main(args):
    bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
    results_folder = Path(args.results_folder)

    results = []

    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
        with open(test_file, "r") as f:
            results = results + json.loads(f.read())

    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

    with open(args.description, "r") as f:
        description = f.read()

    description = description.format(
        nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)

    plt.rcParams.update({'font.size': 20})

    # plot results
    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
    fig.subplots_adjust(hspace=1)
    methods = ["vllm", "trt", "lmdeploy", "tgi"]
    for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
        for j, metric in enumerate(["TTFT", "ITL"]):
            means, stds = [], []
            for method in methods:
                target = df['Test name'].str.contains(model)
                target = target & df['Engine'].str.contains(method)
                filtered_df = df[target]

                if filtered_df.empty:
                    means.append(0.)
                    stds.append(0.)
                else:
                    means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
                    std = filtered_df[f"Std {metric} (ms)"].values[0]
                    success = filtered_df["Successful req."].values[0]
                    stds.append(std / math.sqrt(success))

            print(model, metric)
            print(means, stds)

            ax = axes[i, j + 1]

            bars = ax.bar(
                ["vllm", "trt", "lmdeploy", "tgi"],
                means,
                yerr=stds,
                capsize=10,
            )
            for idx, bar in enumerate(bars):
                bar.set_color(bar_colors[idx])
            ax.set_ylim(bottom=0)

            ax.set_ylabel(f"{metric} (ms)")
            ax.set_title(f"{model} {metric}")
            ax.grid(axis='y')

        # throughput goes into the first column of each row
        metric = "Tput"
        j = 0
        tputs = []
        for method in methods:
            target = df['Test name'].str.contains(model)
            target = target & df['Engine'].str.contains(method)
            filtered_df = df[target]

            if filtered_df.empty:
                tputs.append(0.)
            else:
                input_tput = filtered_df["Input Tput (tok/s)"].values[0]
                output_tput = filtered_df["Output Tput (tok/s)"].values[0]
                tputs.append(input_tput + output_tput)

        print(model, metric)
        print(tputs)

        ax = axes[i, j]

        bars = ax.bar(
            ["vllm", "trt", "lmdeploy", "tgi"],
            tputs,
        )
        for idx, bar in enumerate(bars):
            bar.set_color(bar_colors[idx])

        ax.set_ylim(bottom=0)

        ax.set_ylabel("Tput (token/s)")
        ax.set_title(f"{model} {metric}")
        ax.grid(axis='y')

    fig.tight_layout()
    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)


if __name__ == '__main__':
    args = parse_arguments()
    main(args)
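The `std / math.sqrt(success)` term above is the standard error of the mean over successful requests, which is exactly the error bar described in `nightly-descriptions.md`.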
.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh (new file, 218 lines)
@@ -0,0 +1,218 @@
#!/bin/bash

set -o pipefail

check_gpus() {
    # check the number of GPUs and the GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
    echo "GPU type is $gpu_type"
}

kill_gpu_processes() {
    pkill lmdeploy || true
    # wait for the GPU processes to be fully killed
    sleep 10
    # Print the GPU memory usage
    # so that we know whether all GPU processes have been killed.
    gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
    # The memory usage should be 0 MB.
    echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

json2args() {
    # transforms the JSON string to command line args, where '_' is replaced with '-'
    # example:
    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    local json_string=$1
    local args=$(
        echo "$json_string" | jq -r '
            to_entries |
            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
            join(" ")
        '
    )
    echo "$args"
}

wait_for_server() {
    # wait for the lmdeploy server to start
    # return 1 if the server crashes
    timeout 1200 bash -c '
        until curl -s localhost:8000/v1/completions > /dev/null; do
            sleep 1
        done' && return 0 || return 1
}

run_serving_tests() {
    # run serving tests using `benchmark_serving.py`
    # $1: a json file specifying serving test cases

    local serving_test_file
    serving_test_file=$1

    # Iterate over serving tests
    jq -c '.[]' "$serving_test_file" | while read -r params; do
        # get the test name, and append the GPU type back to it.
        test_name=$(echo "$params" | jq -r '.test_name')

        # if TEST_SELECTOR is set, only run the test cases that match the selector
        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
            echo "Skip test case $test_name."
            continue
        fi

        # append lmdeploy to the test name
        test_name=lmdeploy_$test_name

        # get common parameters
        common_params=$(echo "$params" | jq -r '.common_parameters')
        model=$(echo "$common_params" | jq -r '.model')
        tp=$(echo "$common_params" | jq -r '.tp')
        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
        port=$(echo "$common_params" | jq -r '.port')
        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

        # get client and server arguments
        server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
        client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
        server_args=$(json2args "$server_params")
        client_args=$(json2args "$client_params")
        qps_list=$(echo "$params" | jq -r '.qps_list')
        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
        echo "Running over qps list $qps_list"

        # check if there are enough GPUs to run the test
        if [[ $gpu_count -lt $tp ]]; then
            echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
            continue
        fi

        # prepare tokenizer
        rm -rf /tokenizer_cache
        mkdir /tokenizer_cache
        python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
            --model "$model" \
            --cachedir /tokenizer_cache

        server_command="lmdeploy serve api_server $model \
            --tp $tp \
            --server-port $port \
            $server_args"

        # run the server
        echo "Running test case $test_name"
        echo "Server command: $server_command"
        bash -c "$server_command" &

        # wait until the server is alive
        wait_for_server
        if [ $? -eq 0 ]; then
            echo ""
            echo "lmdeploy server is up and running."
        else
            echo ""
            echo "lmdeploy failed to start within the timeout period."
            break
        fi

        # get the model name
        model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)

        # iterate over different QPS
        for qps in $qps_list; do
            # remove the surrounding single quote from qps
            if [[ "$qps" == *"inf"* ]]; then
                echo "qps was $qps"
                qps="inf"
                echo "now qps is $qps"
            fi

            new_test_name=$test_name"_qps_"$qps

            client_command="python3 benchmark_serving.py \
                --backend lmdeploy \
                --tokenizer /tokenizer_cache \
                --dataset-name $dataset_name \
                --dataset-path $dataset_path \
                --num-prompts $num_prompts \
                --port $port \
                --save-result \
                --result-dir $RESULTS_FOLDER \
                --result-filename ${new_test_name}.json \
                --request-rate $qps \
                --model \"$model_name\" \
                $client_args"

            echo "Running test case $test_name with qps $qps"
            echo "Client command: $client_command"

            eval "$client_command"

            # record the benchmarking commands
            jq_output=$(jq -n \
                --arg server "$server_command" \
                --arg client "$client_command" \
                --arg gpu "$gpu_type" \
                --arg engine "lmdeploy" \
                '{
                    server_command: $server,
                    client_command: $client,
                    gpu_type: $gpu,
                    engine: $engine
                }')
            echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

        done

        # clean up
        kill_gpu_processes
        rm -rf /root/.cache/huggingface/*
    done
}


upload_to_buildkite() {
    # upload the benchmarking results to buildkite

    # if the agent binary is not found, skip uploading the results and return 0
    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip uploading the results."
        return 0
    fi
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}


main() {

    check_gpus
    # enter the vllm directory
    cd $VLLM_SOURCE_CODE_LOC/benchmarks

    declare -g RESULTS_FOLDER=results/
    mkdir -p $RESULTS_FOLDER
    BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

    python -m pip install transformers==4.41.2

    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
    python -m pip install tabulate pandas
    python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
    upload_to_buildkite

}

main "$@"
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh (new file, 216 lines)
@@ -0,0 +1,216 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
check_gpus() {
|
||||||
|
# check the number of GPUs and GPU type.
|
||||||
|
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||||
|
if [[ $gpu_count -gt 0 ]]; then
|
||||||
|
echo "GPU found."
|
||||||
|
else
|
||||||
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||||
|
echo "GPU type is $gpu_type"
|
||||||
|
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
|
||||||
|
pkill text-generation || true
|
||||||
|
# waiting for GPU processes to be fully killed
|
||||||
|
sleep 10
|
||||||
|
# Print the GPU memory usage
|
||||||
|
# so that we know if all GPU processes are killed.
|
||||||
|
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||||
|
# The memory usage should be 0 MB.
|
||||||
|
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||||
|
}
|
||||||
|
|
||||||
|
json2args() {
|
||||||
|
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||||
|
# example:
|
||||||
|
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||||
|
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||||
|
local json_string=$1
|
||||||
|
local args=$(
|
||||||
|
echo "$json_string" | jq -r '
|
||||||
|
to_entries |
|
||||||
|
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||||
|
join(" ")
|
||||||
|
'
|
||||||
|
)
|
||||||
|
echo "$args"
|
||||||
|
}
|
||||||
|
|
||||||
|
wait_for_server() {
|
||||||
|
timeout 1200 bash -c '
|
||||||
|
until curl -s localhost:8000/generate_stream > /dev/null; do
|
||||||
|
sleep 1
|
||||||
|
done' && return 0 || return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
run_serving_tests() {
|
||||||
|
# run serving tests using `benchmark_serving.py`
|
||||||
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
|
local serving_test_file
|
||||||
|
serving_test_file=$1
|
||||||
|
|
||||||
|
# Iterate over serving tests
|
||||||
|
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||||
|
# get the test name, and append the GPU type back to it.
|
||||||
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
|
||||||
|
|
||||||
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
|
echo "Skip test case $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# append tgi to the test name
|
||||||
|
test_name=tgi_$test_name
|
||||||
|
|
||||||
|
# get common parameters
|
||||||
|
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||||
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
|
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||||
|
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||||
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
|
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||||
|
|
||||||
|
# get client and server arguments
|
||||||
|
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
|
||||||
|
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
|
||||||
|
server_args=$(json2args "$server_params")
|
||||||
|
client_args=$(json2args "$client_params")
|
||||||
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
|
echo "Running over qps list $qps_list"
|
||||||
|
|
||||||
|
# check if there is enough GPU to run the test
|
||||||
|
if [[ $gpu_count -lt $tp ]]; then
|
||||||
|
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||||
|
echo "Key 'fp8' exists in common params."
|
||||||
|
server_command="/tgi-entrypoint.sh \
|
||||||
|
--model-id $model \
|
||||||
|
--num-shard $tp \
|
||||||
|
--port $port \
|
||||||
|
--quantize fp8 \
|
||||||
|
$server_args"
|
||||||
|
else
|
||||||
|
echo "Key 'fp8' does not exist in common params."
|
||||||
|
server_command="/tgi-entrypoint.sh \
|
||||||
|
--model-id $model \
|
||||||
|
--num-shard $tp \
|
||||||
|
--port $port \
|
||||||
|
$server_args"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# run the server
|
||||||
|
echo "Running test case $test_name"
|
||||||
|
echo "Server command: $server_command"
|
||||||
|
eval "$server_command" &
|
||||||
|
|
||||||
|
# wait until the server is alive
|
||||||
|
wait_for_server
|
||||||
|
if [ $? -eq 0 ]; then
|
||||||
|
echo ""
|
||||||
|
echo "tgi server is up and running."
|
||||||
|
else
|
||||||
|
echo ""
|
||||||
|
echo "tgi failed to start within the timeout period."
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
# iterate over different QPS
|
||||||
|
for qps in $qps_list; do
|
||||||
|
# remove the surrounding single quote from qps
|
||||||
|
if [[ "$qps" == *"inf"* ]]; then
|
||||||
|
echo "qps was $qps"
|
||||||
|
qps="inf"
|
||||||
|
echo "now qps is $qps"
|
||||||
|
fi
|
||||||
|
|
||||||
|
new_test_name=$test_name"_qps_"$qps
|
||||||
|
|
||||||
|
client_command="python3 benchmark_serving.py \
|
||||||
|
--backend tgi \
|
||||||
|
--model $model \
|
||||||
|
--dataset-name $dataset_name \
|
||||||
|
--dataset-path $dataset_path \
|
||||||
|
--num-prompts $num_prompts \
|
||||||
|
--port $port \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
$client_args"
|
||||||
|
|
||||||
|
echo "Running test case $test_name with qps $qps"
|
||||||
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
|
eval "$client_command"
|
||||||
|
|
||||||
|
# record the benchmarking commands
|
||||||
|
jq_output=$(jq -n \
|
||||||
|
--arg server "$server_command" \
|
||||||
|
--arg client "$client_command" \
|
||||||
|
--arg gpu "$gpu_type" \
|
||||||
|
--arg engine "tgi" \
|
||||||
|
'{
|
||||||
|
server_command: $server,
|
||||||
|
client_command: $client,
|
||||||
|
gpu_type: $gpu,
|
||||||
|
engine: $engine
|
||||||
|
}')
|
||||||
|
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
# clean up
|
||||||
|
kill_gpu_processes
|
||||||
|
rm -rf /root/.cache/huggingface/*
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
upload_to_buildkite() {
|
||||||
|
# upload the benchmarking results to buildkite
|
||||||
|
|
||||||
|
# if the agent binary is not found, skip uploading the results, exit 0
|
||||||
|
if [ ! -f /workspace/buildkite-agent ]; then
|
||||||
|
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||||
|
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||||
|
}
|
||||||
|
|
||||||
|
main() {
|
||||||
|
|
||||||
|
check_gpus
|
||||||
|
# enter vllm directory
|
||||||
|
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||||
|
declare -g RESULTS_FOLDER=results/
|
||||||
|
mkdir -p $RESULTS_FOLDER
|
||||||
|
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||||
|
|
||||||
|
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||||
|
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||||
|
python -m pip install tabulate pandas
|
||||||
|
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||||
|
upload_to_buildkite
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
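The `json2args` helper above is the glue between the JSON test definitions and the CLI flags of each engine. As a quick sanity check, here is a standalone sketch of the same jq transformation; the input object below is made up for illustration, not taken from the test suite:

```bash
#!/bin/bash
# Illustrative only: the parameter object is hypothetical.
json_string='{ "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }'
echo "$json_string" | jq -r '
  to_entries |
  map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
  join(" ")
'
# prints: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
```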
214  .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh  Normal file
@@ -0,0 +1,214 @@
#!/bin/bash

set -o pipefail

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}

kill_gpu_processes() {
  pkill tritonserver || true
  # waiting for GPU processes to be fully killed
  sleep 20
  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  timeout 1200 bash -c '
    until curl -s localhost:8000/generate_stream > /dev/null; do
      sleep 1
    done' && return 0 || return 1
}

run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # append trt to the test name
    test_name=trt_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    cd $VLLM_SOURCE_CODE_LOC/benchmarks

    echo "Running test case $test_name"
    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"

    # wait until the server is alive
    wait_for_server
    if [ $? -eq 0 ]; then
      echo ""
      echo "trt server is up and running."
    else
      echo ""
      echo "trt failed to start within the timeout period."
      break
    fi

    # prepare tokenizer
    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd $VLLM_SOURCE_CODE_LOC/benchmarks

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --backend tensorrt-llm \
        --tokenizer /tokenizer_cache \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      server_command=""
      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "trt" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

main() {
  check_gpus

  # enter vllm directory
  cd $VLLM_SOURCE_CODE_LOC/benchmarks

  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # update transformers package, to make sure mixtral tokenizer is available
  python -m pip install transformers -U

  export CURRENT_LLM_SERVING_ENGINE=trt
  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
  python -m pip install tabulate pandas
  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
  upload_to_buildkite
}

main "$@"
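All of these nightly scripts honor `TEST_SELECTOR` as a bash regex matched against each `test_name`, so a subset of the suite can be run without editing the JSON. A hypothetical invocation (the vllm checkout path here is an assumption):

```bash
# Hypothetical: run only the llama8B cases of the TensorRT-LLM nightly suite.
export VLLM_SOURCE_CODE_LOC=/workspace/vllm   # assumption: local vllm checkout
export TEST_SELECTOR='llama8B'                # bash regex matched against each test_name
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh"
```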
221  .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh  Normal file
@@ -0,0 +1,221 @@
#!/bin/bash

set -o pipefail

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}

kill_gpu_processes() {
  # kill all processes on GPU.
  pkill pt_main_thread
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done' && return 0 || return 1
}

run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # append vllm to the test name
    test_name=vllm_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
      server_command="python3 \
        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
        --model $model \
        --port $port \
        $server_args"
    else
      echo "Key 'fp8' does not exist in common params."
      server_command="python3 \
        -m vllm.entrypoints.openai.api_server \
        -tp $tp \
        --model $model \
        --port $port \
        $server_args"
    fi

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    wait_for_server
    if [ $? -eq 0 ]; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --backend vllm \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "vllm" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

main() {
  check_gpus
  # enter vllm directory
  cd $VLLM_SOURCE_CODE_LOC/benchmarks
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  export CURRENT_LLM_SERVING_ENGINE=vllm
  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json

  python3 -m pip install tabulate pandas
  python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
  upload_to_buildkite
}

main "$@"
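The three `wait_for_server` variants are the same pattern with a different health endpoint (`/generate_stream` for tgi and trt, `/v1/completions` for vllm). A parameterized sketch of the shared pattern, with the endpoint and timeout as arguments; `wait_for_endpoint` is a hypothetical helper name, not part of these scripts:

```bash
# Sketch of the shared polling pattern, assuming the server listens on localhost.
wait_for_endpoint() {
  local url=$1 timeout_s=${2:-1200}
  # Poll once per second until the endpoint answers or the timeout expires.
  timeout "$timeout_s" bash -c "
    until curl -s '$url' > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

wait_for_endpoint "localhost:8000/v1/completions" 1200
```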
76  .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py  Normal file
@@ -0,0 +1,76 @@
import datetime
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "input_throughput": "Input Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "engine": "Engine",
}

if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # add the result to raw_result
        serving_results.append(raw_result)
        continue

    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)

    serving_md_table_with_headers = tabulate(serving_results,
                                             headers='keys',
                                             tablefmt='pipe',
                                             showindex=False)
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")

    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write('\n')

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:

        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
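In the pipeline, `main()` runs this summary script from `$VLLM_SOURCE_CODE_LOC/benchmarks`, where `results/` holds one `<test>.json` result per run plus the matching `<test>.commands` file written by the runner. A hedged sketch of invoking it by hand; it must be run from the directory containing `results/`, and `CURRENT_LLM_SERVING_ENGINE` feeds the output file prefix:

```bash
# Assumes the current directory contains results/ with *.json and *.commands pairs.
export CURRENT_LLM_SERVING_ENGINE=vllm
python3 ../.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
ls results/*_nightly_results.md results/*_nightly_results.json
```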
17  .buildkite/nightly-benchmarks/scripts/wait-for-image.sh  Normal file
@@ -0,0 +1,17 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

retries=0
while [ $retries -lt 1000 ]; do
    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
        exit 0
    fi

    echo "Waiting for image to be available..."

    retries=$((retries + 1))
    sleep 5
done

exit 1
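The same token-plus-manifest handshake can be run once by hand to check whether a given commit's image has been pushed; the commit SHA below is a placeholder:

```bash
BUILDKITE_COMMIT=0123abc   # placeholder: the commit SHA to check
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}\n" \
  "https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
# 200 means the image manifest exists; anything else means keep waiting.
```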
67  .buildkite/nightly-benchmarks/tests/descriptions.md  Normal file
@@ -0,0 +1,67 @@

## Latency tests

This test suite aims to test vllm's end-to-end latency under a controlled setup.

- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

### Latency benchmarking results

{latency_tests_markdown_table}

## Throughput tests

This test suite aims to test vllm's throughput.

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.

### Throughput benchmarking results

{throughput_tests_markdown_table}

## Serving tests

This test suite aims to test vllm's real serving metrics.

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).

### Serving benchmarking results

{serving_tests_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:

```python
import json
import pandas as pd

benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```

The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```

You can also check the raw experiment data in the Artifact tab of the Buildkite page.
32  .buildkite/nightly-benchmarks/tests/latency-tests.json  Normal file
@@ -0,0 +1,32 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama70B_tp4",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15
        }
    },
    {
        "test_name": "latency_mixtral8x7B_tp2",
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15
        }
    }
]
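Each `parameters` object here is meant to be fed through `json2args` and appended to the latency benchmark command line; note that the underscore and hyphen spellings (`num_iters_warmup` vs `num-iters-warmup`) collapse to the same flag after the `gsub`. A quick check against the first entry, run from the repo root:

```bash
jq -r '.[0].parameters | to_entries |
  map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")' \
  .buildkite/nightly-benchmarks/tests/latency-tests.json
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```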
116  .buildkite/nightly-benchmarks/tests/nightly-tests.json  Normal file
@@ -0,0 +1,116 @@
[
    {
        "test_name": "llama8B_tp1",
        "qps_list": [4],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tp": 1,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
            "port": 8000
        },
        "lmdeploy_server_parameters": {
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "float16",
            "max_batch_size": 256,
            "max_input_len": 4096,
            "max_output_len": 4096,
            "trt_llm_version": "r24.04"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": ""
        },
        "vllm_client_parameters": {
        }
    },
    {
        "test_name": "llama70B_tp4",
        "qps_list": [2],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tp": 4,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
            "port": 8000
        },
        "lmdeploy_server_parameters": {
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "float16",
            "max_batch_size": 256,
            "max_input_len": 4096,
            "max_output_len": 4096,
            "trt_llm_version": "r24.04"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": ""
        },
        "vllm_client_parameters": {
        }
    },
    {
        "test_name": "mixtral8x7B_tp2",
        "qps_list": [2],
        "common_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tp": 2,
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 500,
            "port": 8000
        },
        "lmdeploy_server_parameters": {
        },
        "lmdeploy_client_parameters": {
        },
        "tgi_server_parameters": {
        },
        "tgi_client_parameters": {
            "endpoint": "/generate_stream"
        },
        "trt_server_parameters": {
            "model_type": "llama",
            "model_dtype": "float16",
            "max_batch_size": 256,
            "max_input_len": 4096,
            "max_output_len": 4096,
            "trt_llm_version": "r24.04"
        },
        "trt_client_parameters": {
            "endpoint": "/v2/models/ensemble/generate_stream"
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": ""
        },
        "vllm_client_parameters": {
        }
    }
]
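Each nightly entry carries one parameter block per engine; every runner picks out only its own blocks (`.tgi_*`, `.trt_*`, `.vllm_*` parameters) plus the shared `common_parameters`. For example, to see what the vllm runner would receive for the first case:

```bash
jq '.[0] | {common_parameters, vllm_server_parameters, vllm_client_parameters}' \
  .buildkite/nightly-benchmarks/tests/nightly-tests.json
```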
80  .buildkite/nightly-benchmarks/tests/serving-tests.json  Normal file
@@ -0,0 +1,80 @@
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama70B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
        "qps_list": [2],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "disable_log_requests": "",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
            "num_speculative_tokens": 4,
            "speculative_draft_tensor_parallel_size": 1,
            "use_v2_block_manager": ""
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    }
]
35  .buildkite/nightly-benchmarks/tests/throughput-tests.json  Normal file
@@ -0,0 +1,35 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama70B_tp4",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_mixtral8x7B_tp2",
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
]
19  .buildkite/release-pipeline.yaml  Normal file
@@ -0,0 +1,19 @@
steps:
  - label: "Build wheel - CUDA {{matrix.cuda_version}}"
    agents:
      queue: cpu_queue
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      # rename the files to change linux -> manylinux1
      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
    env:
      DOCKER_BUILDKIT: "1"
    matrix:
      setup:
        cuda_version:
          - "11.8.0"
          - "12.1.0"
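The rename step relies on bash's pattern substitution, with `$$` escaping `$` from Buildkite's own interpolation. The same expansion in plain bash, with an illustrative wheel name:

```bash
f="artifacts/dist/vllm-0.5.0-cp310-cp310-linux_x86_64.whl"   # illustrative file name
# ${f/linux/manylinux1} replaces the first occurrence of "linux" in $f.
echo "${f/linux/manylinux1}"
# artifacts/dist/vllm-0.5.0-cp310-cp310-manylinux1_x86_64.whl
```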
@@ -1,38 +1,84 @@
-# This script build the ROCm docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
+# This script runs test inside the corresponding ROCm docker container.
 set -ex
 
 # Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- ROCm info"
 rocminfo
 
-# Try building the docker image
-docker build -t rocm -f Dockerfile.rocm .
+# cleanup older docker images
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes
+    docker volume prune -f
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
 
-# Setup cleanup
-remove_docker_container() { docker rm -f rocm || true; }
-trap remove_docker_container EXIT
-remove_docker_container
+# Call the cleanup docker function
+cleanup_docker
 
-# Run the image
-docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
+echo "--- Resetting GPUs"
 
-# Wait for the server to start
-wait_for_server_to_start() {
-  timeout=300
-  counter=0
+echo "reset" > /opt/amdgpu/etc/gpu_state
 
-  while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
-    sleep 1
-    counter=$((counter + 1))
-    if [ $counter -ge $timeout ]; then
-      echo "Timeout after $timeout seconds"
-      break
-    fi
-  done
-}
-wait_for_server_to_start
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
 
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-  localhost:8000/generate \
-  -d '{"prompt": "San Francisco is a"}'
+echo "--- Pulling container"
+image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull ${image_name}
+
+remove_docker_container() {
+   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
+}
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p ${HF_CACHE}
+HF_MOUNT="/root/.cache/huggingface"
+
+docker run \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
+        --shm-size=16gb \
+        --rm \
+        -e HF_TOKEN \
+        -v ${HF_CACHE}:${HF_MOUNT} \
+        -e HF_HOME=${HF_MOUNT} \
+        --name ${container_name} \
+        ${image_name} \
+        /bin/bash -c "${@}"
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
@@ -50,11 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
 echo '```' >> benchmark_results.md
-tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
 echo '```' >> benchmark_results.md
 
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /usr/bin/buildkite-agent ]; then
+    exit 0
+fi
+
 # upload the results to buildkite
-/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
 
 # exit with the exit code of the benchmarks
 if [ $bench_latency_exit_code -ne 0 ]; then
@@ -69,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
   exit $bench_serving_exit_code
 fi
 
-/workspace/buildkite-agent artifact upload openai-*.json
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+buildkite-agent artifact upload "*.json"
40  .buildkite/run-cpu-test.sh  Normal file
@@ -0,0 +1,40 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
 --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
 --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
  pip install pytest Pillow protobuf
  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

# online inference
docker exec cpu-test bash -c "
  export VLLM_CPU_KVCACHE_SPACE=10
  export VLLM_CPU_OMP_THREADS_BIND=48-92
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
  python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name random \
    --model facebook/opt-125m \
    --num-prompts 20 \
    --endpoint /v1/completions \
    --tokenizer facebook/opt-125m"
105  .buildkite/run-multi-node-test.sh  Executable file
@@ -0,0 +1,105 @@
#!/bin/bash

set -euox pipefail

if [[ $# -lt 4 ]]; then
    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
fi

WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4

shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
    echo "The number of commands must be equal to the number of nodes."
    echo "Number of nodes: $NUM_NODES"
    echo "Number of commands: ${#COMMANDS[@]}"
    exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
    echo $command
done

start_network() {
    docker network create --subnet=192.168.10.0/24 docker-net
}

start_nodes() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
        GPU_DEVICES='"device='
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
        GPU_DEVICES+='"'

        # start the container in detached mode
        # things to note:
        # 1. --shm-size=10.24gb is required. don't use --ipc=host
        # 2. pass HF_TOKEN to the container
        # 3. map the huggingface cache directory to the container
        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
        # starting from 192.168.10.11)
        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"

        # organize containers into a ray cluster
        if [ $node -eq 0 ]; then
            # start the ray head node
            docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
            # wait for the head node to be ready
            sleep 10
        else
            # start the ray worker nodes, and connect them to the head node
            docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
        fi
    done

    # wait for the cluster to be ready
    sleep 10

    # print the cluster status
    docker exec node0 /bin/bash -c "ray status"
}

run_nodes() {
    # important: iterate in reverse order to start the head node last
    # we start the worker nodes first, in detached mode, and then start the head node
    # in the foreground, so that the output of the head node is visible in the buildkite logs
    for node in $(seq $(($NUM_NODES - 1)) -1 0); do
        GPU_DEVICES='"device='
        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
            GPU_DEVICES+=$(($DEVICE_NUM))
            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
                GPU_DEVICES+=','
            fi
        done
        GPU_DEVICES+='"'
        echo "Running node$node with GPU devices: $GPU_DEVICES"
        if [ $node -ne 0 ]; then
            docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        else
            docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
        fi
    done
}
cleanup() {
    for node in $(seq 0 $(($NUM_NODES-1))); do
        docker stop node$node
    done
    docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes
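Putting the pieces together, a hypothetical two-node, two-GPU-per-node invocation passes exactly one shell command per node (the image tag below is a placeholder):

```bash
# Hypothetical: 2 nodes x 2 GPUs, one command per node; image tag is a placeholder.
.buildkite/run-multi-node-test.sh /vllm-workspace/tests 2 2 my-vllm-image:latest \
  "pytest -v -s distributed/test_same_node.py" \
  "pytest -v -s distributed/test_same_node.py"
```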
51  .buildkite/run-neuron-test.sh  Normal file
@@ -0,0 +1,51 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        docker system prune -f
        echo $current_time > /tmp/neuron-docker-build-timestamp
    fi
else
    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
     localhost:8000/generate \
     -d '{"prompt": "San Francisco is a"}'
14  .buildkite/run-openvino-test.sh  Executable file
@@ -0,0 +1,14 @@
# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .

# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
16  .buildkite/run-tpu-test.sh  Normal file
@@ -0,0 +1,16 @@
set -e

# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
    python3 /workspace/vllm/examples/offline_inference_tpu.py
14  .buildkite/run-xpu-test.sh  Normal file
@@ -0,0 +1,14 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .

# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
@@ -1,97 +1,268 @@
 # In this file, you can add more tests to run either by adding a new step or
 # adding a new command to an existing step. See different options here for examples.
-# This script will be fed into Jinja template in `test-template.j2` to generate
-# the final pipeline yaml file.
+# This script will be fed into Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.

 steps:
+- label: Async Engine, Inputs, Utils, Worker Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s test_inputs.py
+  - pytest -v -s multimodal
+  - pytest -v -s test_utils.py # Utils
+  - pytest -v -s worker # Worker
+
+- label: Metrics, Tracing Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - pytest -v -s metrics # Metrics
+  - "pip install \
+      opentelemetry-sdk \
+      opentelemetry-api \
+      opentelemetry-exporter-otlp \
+      opentelemetry-semantic-conventions-ai" # Tracing
+  - pytest -v -s tracing
+
 - label: Regression Test
+  mirror_hardwares: [amd]
+  fast_check: true
   command: pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional

 - label: AsyncEngine Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s async_engine

 - label: Basic Correctness Test
-  command: pytest -v -s basic_correctness
+  mirror_hardwares: [amd]
+  fast_check: true
+  commands:
+  # This flashinfer installation will fail on AMD ROCm, so it is set as optional.
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Core Test
-  command: pytest -v -s core
+  mirror_hardwares: [amd]
+  fast_check: true
+  commands:
+  - pytest -v -s core

 - label: Distributed Comm Ops Test
-  command: pytest -v -s test_comm_ops.py
-  working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2 # only support 1 or 2 for now.
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2

-- label: Distributed Tests
-  working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2 # only support 1 or 2 for now.
   commands:
-  - pytest -v -s test_pynccl.py
-  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py

+- label: 2 Node Tests (4 GPUs in total)
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

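Both nodes run the same torchrun command and point `--rdzv_endpoint` at the first node's address, so the 2 + 2 ranks rendezvous into one job through the c10d backend. A minimal sketch of the same pattern outside this pipeline; the addresses, port, and script name are illustrative:

```bash
# Run this identically on both nodes; only the endpoint host must be node 1.
torchrun --nnodes 2 --nproc-per-node 2 \
  --rdzv_backend c10d --rdzv_endpoint 192.168.10.10:29400 \
  my_distributed_check.py  # hypothetical stand-in for distributed/test_same_node.py
```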
+- label: Distributed Tests (2 GPUs)
+  mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  commands:
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
+  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
+
+- label: Distributed Tests (4 GPUs)
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  fast_check: true
+  commands:
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+
+- label: Pipeline Parallelism Test
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  commands:
+  - pytest -v -s distributed/test_pipeline_parallel.py

 - label: Engine Test
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py
+  mirror_hardwares: [amd]
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  # OOM in the CI unless we run this separately
+  - pytest -v -s tokenization

 - label: Entrypoints Test
-  command: pytest -v -s entrypoints
+  fast_check: true
+  mirror_hardwares: [amd]
+
+  commands:
+  - pytest -v -s entrypoints/llm
+  - pytest -v -s entrypoints/openai

 - label: Examples Test
   working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
   commands:
-  # install aws cli for llava_example.py
-  - pip install awscli
+  # install tensorizer for tensorize_vllm_model.py
+  - pip install awscli tensorizer
   - python3 offline_inference.py
+  - python3 cpu_offload.py
   - python3 offline_inference_with_prefix.py
   - python3 llm_engine_example.py
-  - python3 llava_example.py
+  - python3 offline_inference_vision_language.py
+  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

-- label: Kernels Test %N
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+- label: Inputs Test
+  #mirror_hardwares: [amd]
+  commands:
+  - pytest -v -s test_inputs.py
+  - pytest -v -s multimodal
+
+# - label: Kernels Test %N
+#   #mirror_hardwares: [amd]
+#   commands:
+#     - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+#     - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+#   parallelism: 4

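In the (now commented-out) Kernels step, `--shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT` splits one suite across the `parallelism: 4` Buildkite jobs; the doubled `$$` keeps the variables from being interpolated by the templating layer. Assuming these flags come from the pytest-shard plugin, the same split can be reproduced by hand:

```bash
pip install pytest-shard   # assumption: this plugin provides --shard-id/--num-shards
# Run shard 0 of 4 locally; repeat with 1..3 to cover the full suite.
pytest -v -s kernels --shard-id=0 --num-shards=4
```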
 - label: Models Test
+  #mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
-  - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
+  - pytest -v -s models -m \"not vlm\"

-- label: Llava Test
+- label: Vision Language Models Test
+  mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
-  - pytest -v -s models/test_llava.py
+  - pytest -v -s models -m vlm

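The old Llava step's ignore-lists are replaced by a pytest marker: model tests tagged `vlm` run in the Vision Language Models step and everything else stays in Models Test. The `-m` expressions compose as usual:

```bash
pytest -v -s models -m vlm         # only tests marked as vision-language
pytest -v -s models -m "not vlm"   # the remaining model tests
```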
 - label: Prefix Caching Test
+  mirror_hardwares: [amd]
   commands:
   - pytest -v -s prefix_caching

 - label: Samplers Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s samplers

 - label: LogitsProcessor Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_logits_processor.py

+- label: Utils Test
+  commands:
+  - pytest -v -s test_utils.py
+  - pytest -v -s test_embedded_commit.py

 - label: Worker Test
+  mirror_hardwares: [amd]
   command: pytest -v -s worker

 - label: Speculative decoding tests
-  command: pytest -v -s spec_decode
+  #mirror_hardwares: [amd]
+  commands:
+  # See https://github.com/vllm-project/vllm/issues/5152
+  - export VLLM_ATTENTION_BACKEND=XFORMERS
+  - pytest -v -s spec_decode

-- label: LoRA Test %N
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+# - label: LoRA Test %N
+#   #mirror_hardwares: [amd]
+#   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+#   parallelism: 4
+
+# - label: LoRA Long Context (Distributed)
+#   #mirror_hardwares: [amd]
+#   num_gpus: 4
+#   # This test runs llama 13B, so it is required to run on 4 GPUs.
+#   commands:
+#     # FIXIT: find out which code initialize cuda before running the test
+#     # before the fix, we need to use spawn to test it
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - pytest -v -s -x lora/test_long_context.py
+
+- label: Tensorizer Test
+  #mirror_hardwares: [amd]
+  fast_check: true
+  commands:
+  - apt-get install -y curl libsodium23
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s tensorizer_loader

 - label: Metrics Test
+  mirror_hardwares: [amd]
   command: pytest -v -s metrics

+- label: Quantization Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s quantization
+
+- label: Tracing Test
+  commands:
+  - "pip install \
+      opentelemetry-sdk \
+      opentelemetry-api \
+      opentelemetry-exporter-otlp \
+      opentelemetry-semantic-conventions-ai"
+  - pytest -v -s tracing

 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
   commands:
   - pip install aiohttp
   - bash run-benchmarks.sh

+- label: LM Eval Small Models
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+
+- label: LM Eval Large Models
+  gpu: a100
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4

 - label: Documentation Build
-  working_dir: "/vllm-workspace/docs"
+  working_dir: "/vllm-workspace/test_docs/docs"
+  fast_check: true
   no_gpu: True
   commands:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html

+- label: Distributed Tests (A100)
+  gpu: a100
+  num_gpus: 4
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
+  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s -x lora/test_mixtral.py

@@ -1,66 +0,0 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: "AMD Test"
    agents:
      queue: amd
    command: bash .buildkite/run-amd-test.sh

  - label: ":docker: build image"
    commands:
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
  - wait

  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
    plugins:
      - kubernetes:
          podSpec:
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                  - '-c'
                  - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
                {% if not step.no_gpu %}
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                {% endif %}
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
  {% endfor %}
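To make the `step.command or (step.commands | join(' && '))` line concrete: for a step like Benchmarks above (working_dir `/vllm-workspace/.buildkite`, two commands), this deleted template would have rendered the container invocation as something roughly equivalent to:

```bash
bash -c 'cd /vllm-workspace/.buildkite && pip install aiohttp && bash run-benchmarks.sh'
```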
26  .clang-format  Normal file
@@ -0,0 +1,26 @@
BasedOnStyle: Google
UseTab: Never
IndentWidth: 2
ColumnLimit: 80

# Force pointers to the type for C++.
DerivePointerAlignment: false
PointerAlignment: Left

# Reordering #include statements can (and currently will) introduce errors
SortIncludes: false

# Style choices
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
IndentPPDirectives: BeforeHash

IncludeCategories:
  - Regex: '^<'
    Priority: 4
  - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
    Priority: 3
  - Regex: '^"(qoda|\.\.)/'
    Priority: 2
  - Regex: '.*'
    Priority: 1
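With this file at the repository root, clang-format picks it up automatically. Two sanity checks one might run locally (the file path is chosen only as an example):

```bash
# Rewrite one file in place according to .clang-format.
clang-format -i csrc/cuda_utils_kernels.cu

# Or, as the CI job further below does, check and fail without modifying anything.
clang-format --dry-run --Werror csrc/cuda_utils_kernels.cu
```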
2  .github/FUNDING.yml  vendored  Normal file
@@ -0,0 +1,2 @@
github: [vllm-project]
open_collective: [vllm]
1  .github/ISSUE_TEMPLATE/200-installation.yml  vendored
@@ -18,6 +18,7 @@ body:
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
+It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
 value: |
 ```text
 The output of `python collect_env.py`
1  .github/ISSUE_TEMPLATE/300-usage.yml  vendored
@@ -18,6 +18,7 @@ body:
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
+It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
 value: |
 ```text
 The output of `python collect_env.py`
5  .github/ISSUE_TEMPLATE/400-bug report.yml  vendored
@@ -18,6 +18,7 @@ body:
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
+It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
 value: |
 ```text
 The output of `python collect_env.py`
@@ -57,6 +58,10 @@ body:
 If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.

 Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
+
+If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1`. All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
 placeholder: |
 A clear and concise description of what the bug is.
@@ -39,6 +39,7 @@ body:
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
+It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
 value: |
 ```text
 The output of `python collect_env.py`
|||||||
49
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
Normal file
49
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
Normal file
@@ -0,0 +1,49 @@
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]

body:
- type: markdown
  attributes:
    value: >
      #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
  attributes:
    label: Motivation.
    description: >
      The motivation of the RFC.
  validations:
    required: true
- type: textarea
  attributes:
    label: Proposed Change.
    description: >
      The proposed change of the RFC.
  validations:
    required: true
- type: textarea
  attributes:
    label: Feedback Period.
    description: >
      The feedback period of the RFC. Usually at least one week.
  validations:
    required: false
- type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC.
  validations:
    required: false
- type: textarea
  attributes:
    label: Any Other Things.
    description: >
      Any other things you would like to mention.
  validations:
    required: false
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
21  .github/workflows/add_label_automerge.yml  vendored  Normal file
@@ -0,0 +1,21 @@
name: Add label on auto-merge enabled
on:
  pull_request_target:
    types:
      - auto_merge_enabled
jobs:
  add-label-on-auto-merge:
    runs-on: ubuntu-latest
    steps:
      - name: Add label
        uses: actions/github-script@v5
        with:
          script: |
            github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              labels: ['ready']
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
23  .github/workflows/add_label_ready_comment.yml  vendored  Normal file
@@ -0,0 +1,23 @@
name: Add Ready Label on Ready Comment

on:
  issue_comment:
    types: [created]

jobs:
  add-ready-label:
    runs-on: ubuntu-latest
    if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
    steps:
      - name: Add label
        uses: actions/github-script@v5
        with:
          script: |
            github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              labels: ['ready']
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
36  .github/workflows/clang-format.yml  vendored  Normal file
@@ -0,0 +1,36 @@
name: clang-format

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  clang-format:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install clang-format==18.1.5
    - name: Running clang-format
      run: |
        EXCLUDES=(
            'csrc/moe/topk_softmax_kernels.cu'
        )
        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
            | xargs clang-format --dry-run --Werror
48  .github/workflows/mypy.yaml  vendored  Normal file
@@ -0,0 +1,48 @@
name: mypy

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install mypy==1.9.0
        pip install types-setuptools
        pip install types-PyYAML
        pip install types-requests
        pip install types-setuptools
    - name: Mypy
      run: |
        mypy
        mypy tests --follow-imports skip
        mypy vllm/attention --follow-imports skip
        mypy vllm/core --follow-imports skip
        mypy vllm/distributed --follow-imports skip
        mypy vllm/engine --follow-imports skip
        mypy vllm/entrypoints --follow-imports skip
        mypy vllm/executor --follow-imports skip
        mypy vllm/lora --follow-imports skip
        mypy vllm/model_executor --follow-imports skip
        mypy vllm/prompt_adapter --follow-imports skip
        mypy vllm/spec_decode --follow-imports skip
        mypy vllm/worker --follow-imports skip
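A sketch of mirroring this job locally, using the pinned versions from the workflow and a subset of its per-directory passes:

```bash
pip install mypy==1.9.0 types-setuptools types-PyYAML types-requests
mypy                                   # whole-project pass, as configured
mypy tests --follow-imports skip       # then per-directory passes
mypy vllm/engine --follow-imports skip
```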
12  .github/workflows/publish.yml  vendored
@@ -48,14 +48,20 @@ jobs:
 fail-fast: false
 matrix:
   os: ['ubuntu-20.04']
-  python-version: ['3.8', '3.9', '3.10', '3.11']
+  python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
-  pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
+  pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
   cuda-version: ['11.8', '12.1']

 steps:
 - name: Checkout
   uses: actions/checkout@v3
+
+- name: Setup ccache
+  uses: hendrikmuhs/ccache-action@v1.2
+  with:
+    create-symlink: true
+    key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
 - name: Set up Linux Env
   if: ${{ runner.os == 'Linux' }}
   run: |
@@ -76,6 +82,8 @@ jobs:
 - name: Build wheel
   shell: bash
+  env:
+    CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
   run: |
     bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
     wheel_name=$(ls dist/*whl | xargs -n 1 basename)
21  .github/workflows/reminder_comment.yml  vendored  Normal file
@@ -0,0 +1,21 @@
name: PR Reminder Comment Bot
on:
  pull_request_target:
    types: [opened]

jobs:
  pr_reminder:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to run full CI on PR
        uses: actions/github-script@v6
        with:
          script: |
            github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
23  .github/workflows/remove_label_not_ready_comment.yml  vendored  Normal file
@@ -0,0 +1,23 @@
name: Remove ready Label on notready Comment

on:
  issue_comment:
    types: [created]

jobs:
  add-ready-label:
    runs-on: ubuntu-latest
    if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready')
    steps:
      - name: Remove ready label
        uses: actions/github-script@v5
        with:
          script: |
            github.rest.issues.removeLabel({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              name: 'ready'
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
4  .github/workflows/ruff.yml  vendored
@@ -15,7 +15,7 @@ jobs:
 runs-on: ubuntu-latest
 strategy:
   matrix:
-    python-version: ["3.10"]
+    python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 steps:
 - uses: actions/checkout@v2
 - name: Set up Python ${{ matrix.python-version }}
@@ -25,7 +25,7 @@ jobs:
 - name: Install dependencies
   run: |
     python -m pip install --upgrade pip
-    pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
+    pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
 - name: Analysing the code with ruff
   run: |
     ruff .
7  .github/workflows/scripts/build.sh  vendored
@@ -9,12 +9,11 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 
 # Install requirements
 $python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements.txt
+$python_executable -m pip install -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
-# Make sure punica is built for the release (for LoRA)
-export VLLM_INSTALL_PUNICA_KERNELS=1
+# Make sure release wheels are built for the following architectures
+export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
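A hedged sketch of a local variant of this release build: narrowing `TORCH_CUDA_ARCH_LIST` to a single architecture (8.6 here, for an RTX 30-series card, purely as an example) trades wheel portability for much shorter compile times:

```bash
export TORCH_CUDA_ARCH_LIST="8.6"   # one arch instead of the full release list
export MAX_JOBS=4                   # raise from 1 if the machine has RAM to spare
python setup.py bdist_wheel --dist-dir=dist
```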
2  .github/workflows/scripts/create_release.js  vendored
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
 generate_release_notes: true,
 name: process.env.RELEASE_TAG,
 owner: context.repo.owner,
-prerelease: false,
+prerelease: true,
 repo: context.repo.repo,
 tag_name: process.env.RELEASE_TAG,
 });
2  .github/workflows/yapf.yml  vendored
@@ -14,7 +14,7 @@ jobs:
 runs-on: ubuntu-latest
 strategy:
   matrix:
-    python-version: ["3.10"]
+    python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 steps:
 - uses: actions/checkout@v2
 - name: Set up Python ${{ matrix.python-version }}
6  .gitignore  vendored
@@ -1,3 +1,6 @@
+# vllm commit id, generated by setup.py
+vllm/commit_id.py
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -70,6 +73,8 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst

 # PyBuilder
 .pybuilder/
@@ -181,6 +186,7 @@ _build/
 # hip files generated by PyTorch
 *.hip
 *_hip*
+hip_compat.h

 # Benchmark dataset
 *.json

@@ -10,6 +10,7 @@ build:
 
 sphinx:
   configuration: docs/source/conf.py
+  fail_on_warning: true

 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats:

219  CMakeLists.txt
@@ -2,7 +2,11 @@ cmake_minimum_required(VERSION 3.21)
 
 project(vllm_extensions LANGUAGES CXX)

+# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
+set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
+
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
@@ -10,13 +14,13 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
+set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")

 # Supported NVIDIA architectures.
 set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -28,9 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
-set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
-set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -64,17 +67,50 @@ endif()
 find_package(Torch REQUIRED)

 #
-# Normally `torch.utils.cpp_extension.CUDAExtension` would add
-# `libtorch_python.so` for linking against an extension. Torch's cmake
-# configuration does not include this library (presumably since the cmake
-# config is used for standalone C++ binaries that link against torch).
-# The `libtorch_python.so` library defines some of the glue code between
-# torch/python via pybind and is required by VLLM extensions for this
-# reason. So, add it by manually with `find_library` using torch's
-# installed library path.
+# Add the `default` target which detects which extensions should be
+# built based on platform/architecture. This is the same logic that
+# setup.py uses to select which extensions should be built and should
+# be kept in sync.
 #
-find_library(torch_python_LIBRARY torch_python PATHS
-  "${TORCH_INSTALL_PREFIX}/lib")
+# The `default` target makes direct use of cmake easier since knowledge
+# of which extensions are supported has been factored in, e.g.
+#
+# mkdir build && cd build
+# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
+# cmake --build . --target default
+#
+add_custom_target(default)
+message(STATUS "Enabling core extension.")
+
+# Define _core_C extension
+#  built for (almost) every target platform, (excludes TPU and Neuron)
+
+set(VLLM_EXT_SRC
+  "csrc/core/torch_bindings.cpp")
+
+define_gpu_extension_target(
+  _core_C
+  DESTINATION vllm
+  LANGUAGE CXX
+  SOURCES ${VLLM_EXT_SRC}
+  COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
+  USE_SABI 3
+  WITH_SOABI)
+
+add_dependencies(default _core_C)
+
+#
+# Forward the non-CUDA device extensions to external CMake scripts.
+#
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
+    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
+    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+        include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
+    else()
+        return()
+    endif()
+    return()
+endif()

 #
 # Set up GPU language and check the torch version and warn if it isn't
@@ -95,18 +131,11 @@ elseif(HIP_FOUND)
 # .hip extension automatically, HIP must be enabled explicitly.
 enable_language(HIP)

-# ROCm 5.x
-if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
-    NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
-  message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
-    "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
-endif()
-
-# ROCm 6.x
-if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
-    NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
-  message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
-    "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
+# ROCm 5.X and 6.X
+if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
+    NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+  message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
+    "expected for ROCm build, saw ${Torch_VERSION} instead.")
 endif()
 else()
   message(FATAL_ERROR "Can't find CUDA or HIP installation.")
@@ -136,7 +165,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
 endif()

 #
-# Define extension targets
+# Define other extension targets
 #

 #
@@ -151,15 +180,52 @@ set(VLLM_EXT_SRC
   "csrc/layernorm_kernels.cu"
   "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+  "csrc/quantization/fp8/common.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/moe_align_block_size_kernels.cu"
-  "csrc/pybind.cpp")
+  "csrc/prepare_inputs/advance_step.cu"
+  "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
+  include(FetchContent)
+  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+  FetchContent_Declare(
+        cutlass
+        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+        # CUTLASS 3.5.1
+        GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
+        GIT_PROGRESS TRUE
+  )
+  FetchContent_MakeAvailable(cutlass)
+
   list(APPEND VLLM_EXT_SRC
+    "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
-    "csrc/custom_all_reduce.cu")
+    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
+    "csrc/quantization/fp8/fp8_marlin.cu"
+    "csrc/custom_all_reduce.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+
+  #
+  # The CUTLASS kernels for Hopper require sm90a to be enabled.
+  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+    set_source_files_properties(
+          "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+          PROPERTIES
+          COMPILE_FLAGS
+          "-gencode arch=compute_90a,code=sm_90a")
+  endif()
+
 endif()

 define_gpu_extension_target(
@@ -169,6 +235,8 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  USE_SABI 3
   WITH_SOABI)

 #
@@ -176,7 +244,7 @@ define_gpu_extension_target(
 #

 set(VLLM_MOE_EXT_SRC
-  "csrc/moe/moe_ops.cpp"
+  "csrc/moe/torch_bindings.cpp"
   "csrc/moe/topk_softmax_kernels.cu")

 define_gpu_extension_target(
@@ -186,101 +254,16 @@ define_gpu_extension_target(
   SOURCES ${VLLM_MOE_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  USE_SABI 3
   WITH_SOABI)

-#
-# _punica_C extension
-#
-
-set(VLLM_PUNICA_EXT_SRC
-  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
-  "csrc/punica/punica_ops.cc")
-
-#
-# Copy GPU compilation flags+update for punica
-#
-set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
-list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
-  "-D__CUDA_NO_HALF_OPERATORS__"
-  "-D__CUDA_NO_HALF_CONVERSIONS__"
-  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
-  "-D__CUDA_NO_HALF2_OPERATORS__")
-
-#
-# Filter out CUDA architectures < 8.0 for punica.
-#
-if (${VLLM_GPU_LANG} STREQUAL "CUDA")
-  set(VLLM_PUNICA_GPU_ARCHES)
-  foreach(ARCH ${VLLM_GPU_ARCHES})
-    string_to_ver(CODE_VER ${ARCH})
-    if (CODE_VER GREATER_EQUAL 8.0)
-      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
-    endif()
-  endforeach()
-  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
-endif()
-
-if (VLLM_PUNICA_GPU_ARCHES)
-  define_gpu_extension_target(
-    _punica_C
-    DESTINATION vllm
-    LANGUAGE ${VLLM_GPU_LANG}
-    SOURCES ${VLLM_PUNICA_EXT_SRC}
-    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
-    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
-    WITH_SOABI)
-else()
-  message(WARNING "Unable to create _punica_C target because none of the "
-    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
-endif()
-
-#
-# Add the `default` target which detects which extensions should be
-# built based on platform/architecture. This is the same logic that
-# setup.py uses to select which extensions should be built and should
-# be kept in sync.
-#
-# The `default` target makes direct use of cmake easier since knowledge
-# of which extensions are supported has been factored in, e.g.
-#
-# mkdir build && cd build
-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
-# cmake --build . --target default
-#
-add_custom_target(default)
-
 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
   add_dependencies(default _C)
-endif()

-if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "Enabling moe extension.")
   add_dependencies(default _moe_C)
-
-  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
-  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
-  # there are supported target arches.
-  if (VLLM_PUNICA_GPU_ARCHES AND
-      (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
-    message(STATUS "Enabling punica extension.")
-    add_dependencies(default _punica_C)
-  endif()
 endif()
|||||||
@@ -21,7 +21,6 @@ Express your support on Twitter if vLLM aids you, or simply offer your appreciat
|
|||||||
### Build from source
|
### Build from source
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -r requirements.txt
|
|
||||||
pip install -e . # This may take several minutes.
|
pip install -e . # This may take several minutes.
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -30,6 +29,8 @@ pip install -e . # This may take several minutes.
|
|||||||
```bash
|
```bash
|
||||||
pip install -r requirements-dev.txt
|
pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
# linting and formatting
|
||||||
|
bash format.sh
|
||||||
# Static type checking
|
# Static type checking
|
||||||
mypy
|
mypy
|
||||||
# Unit tests
|
# Unit tests
|
||||||
|
|||||||
224  Dockerfile
@@ -1,136 +1,230 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
+
+# Please update any changes made here to
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png
+
+ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+# prepare basic build environment
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
+
+ARG CUDA_VERSION=12.4.1
+ARG PYTHON_VERSION=3.10
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version

 RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+    && apt-get install -y git curl sudo
+
+# Install pip s.t. it will be compatible with our PYTHON_VERSION
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
+RUN python3 -m pip --version

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

 WORKDIR /workspace

 # install build and runtime dependencies
-COPY requirements.txt requirements.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-adag.txt requirements-adag.txt
+COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    python3 -m pip install -r requirements-cuda.txt

-# install development dependencies
-COPY requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+COPY requirements-mamba.txt requirements-mamba.txt
+RUN python3 -m pip install packaging
+RUN python3 -m pip install -r requirements-mamba.txt

+# cuda arch list used by torch
+# can be useful for both `dev` and `test`
+# explicitly set the list to avoid issues with torch 2.2
+# see https://github.com/pytorch/pytorch/pull/123243
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################

-#################### EXTENSION BUILD IMAGE ####################
-FROM dev AS build
+#################### WHEEL BUILD IMAGE ####################
+FROM base AS build
+
+ARG PYTHON_VERSION=3.10

 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
+    python3 -m pip install -r requirements-build.txt

 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache

-# copy input files
+# files and directories related to build wheels
 COPY csrc csrc
 COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
-COPY requirements.txt requirements.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-adag.txt requirements-adag.txt
+COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
+COPY vllm vllm

-# cuda arch list used by torch
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
 ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+ARG buildkite_commit
+ENV BUILDKITE_COMMIT=${buildkite_commit}
+
+ARG USE_SCCACHE
+# if USE_SCCACHE is set, use sccache to speed up compilation
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
+        && tar -xzf sccache.tar.gz \
+        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && if [ "$CUDA_VERSION" = "11.8.0" ]; then \
+               export SCCACHE_BUCKET=vllm-build-sccache-2; \
+           else \
+               export SCCACHE_BUCKET=vllm-build-sccache; \
+           fi \
+        && export SCCACHE_REGION=us-west-2 \
+        && export CMAKE_BUILD_TYPE=Release \
+        && sccache --show-stats \
+        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
+        && sccache --show-stats; \
+    fi
||||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
python3 setup.py build_ext --inplace
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
|
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||||
|
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
# check the size of the wheel, we cannot upload wheels larger than 100MB
|
||||||
|
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||||
|
RUN python3 check-wheel-size.py dist
|
||||||
|
|
||||||
#################### EXTENSION Build IMAGE ####################
|
#################### EXTENSION Build IMAGE ####################
|
||||||
|
|
||||||
#################### FLASH_ATTENTION Build IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
FROM dev as flash-attn-builder
|
FROM base as dev
|
||||||
|
|
||||||
|
COPY requirements-lint.txt requirements-lint.txt
|
||||||
|
COPY requirements-test.txt requirements-test.txt
|
||||||
|
COPY requirements-dev.txt requirements-dev.txt
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
#################### DEV IMAGE ####################
|
||||||
|
#################### MAMBA Build IMAGE ####################
|
||||||
|
FROM dev as mamba-builder
|
||||||
# max jobs used for build
|
# max jobs used for build
|
||||||
ARG max_jobs=2
|
ARG max_jobs=2
|
||||||
ENV MAX_JOBS=${max_jobs}
|
ENV MAX_JOBS=${max_jobs}
|
||||||
# flash attention version
|
|
||||||
ARG flash_attn_version=v2.5.6
|
|
||||||
ENV FLASH_ATTN_VERSION=${flash_attn_version}
|
|
||||||
|
|
||||||
WORKDIR /usr/src/flash-attention-v2
|
WORKDIR /usr/src/mamba
|
||||||
|
|
||||||
|
COPY requirements-mamba.txt requirements-mamba.txt
|
||||||
|
|
||||||
# Download the wheel or build it if a pre-compiled release doesn't exist
|
# Download the wheel or build it if a pre-compiled release doesn't exist
|
||||||
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
|
RUN pip --verbose wheel -r requirements-mamba.txt \
|
||||||
--no-build-isolation --no-deps --no-cache-dir
|
--no-build-isolation --no-deps --no-cache-dir
|
||||||
|
|
||||||
#################### FLASH_ATTENTION Build IMAGE ####################
|
#################### MAMBA Build IMAGE ####################
|
||||||
|
|
||||||
|
#################### vLLM installation IMAGE ####################
|
||||||
|
# image with vLLM installed
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
WORKDIR /vllm-workspace
|
||||||
|
|
||||||
|
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||||
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y ccache software-properties-common \
|
||||||
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||||
|
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
|
||||||
|
&& python3 --version
|
||||||
|
|
||||||
|
RUN apt-get update -y \
|
||||||
|
&& apt-get install -y python3-pip git vim curl libibverbs-dev
|
||||||
|
|
||||||
|
# Install pip s.t. it will be compatible with our PYTHON_VERSION
|
||||||
|
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
|
||||||
|
RUN python3 -m pip --version
|
||||||
|
|
||||||
|
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||||
|
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||||
|
# this won't be needed for future versions of this docker image
|
||||||
|
# or future versions of triton.
|
||||||
|
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||||
|
|
||||||
|
# install vllm wheel first, so that torch etc will be installed
|
||||||
|
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||||
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install dist/*.whl --verbose
|
||||||
|
|
||||||
|
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
|
||||||
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
|
||||||
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
#################### TEST IMAGE ####################
|
#################### TEST IMAGE ####################
|
||||||
# image to run unit testing suite
|
# image to run unit testing suite
|
||||||
FROM dev AS test
|
# note that this uses vllm installed by `pip`
|
||||||
|
FROM vllm-base AS test
|
||||||
|
|
||||||
# copy pytorch extensions separately to avoid having to rebuild
|
|
||||||
# when python code changes
|
|
||||||
WORKDIR /vllm-workspace
|
|
||||||
# ADD is used to preserve directory structure
|
|
||||||
ADD . /vllm-workspace/
|
ADD . /vllm-workspace/
|
||||||
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
|
|
||||||
# Install flash attention (from pre-built wheel)
|
|
||||||
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
|
|
||||||
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
|
|
||||||
# ignore build dependencies installation because we are using pre-complied extensions
|
|
||||||
RUN rm pyproject.toml
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
|
|
||||||
#################### TEST IMAGE ####################
|
|
||||||
|
|
||||||
|
# install development dependencies (for testing)
|
||||||
#################### RUNTIME BASE IMAGE ####################
|
|
||||||
# We used base cuda image because pytorch installs its own cuda libraries.
|
|
||||||
# However pynccl depends on cuda libraries so we had to switch to the runtime image
|
|
||||||
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
|
|
||||||
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
|
|
||||||
|
|
||||||
# libnccl required for ray
|
|
||||||
RUN apt-get update -y \
|
|
||||||
&& apt-get install -y python3-pip
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -r requirements.txt
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
# Install flash attention (from pre-built wheel)
|
# doc requires source code
|
||||||
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
|
# we hide them inside `test_docs/` , so that this source code
|
||||||
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
|
# will not be imported by other tests
|
||||||
|
RUN mkdir test_docs
|
||||||
#################### RUNTIME BASE IMAGE ####################
|
RUN mv docs test_docs/
|
||||||
|
RUN mv vllm test_docs/
|
||||||
|
|
||||||
|
#################### TEST IMAGE ####################
|
||||||
|
|
||||||
#################### OPENAI API SERVER ####################
|
#################### OPENAI API SERVER ####################
|
||||||
# openai api server alternative
|
# openai api server alternative
|
||||||
FROM vllm-base AS vllm-openai
|
FROM vllm-base AS vllm-openai
|
||||||
|
|
||||||
# install additional dependencies for openai api server
|
# install additional dependencies for openai api server
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install accelerate hf_transfer modelscope
|
pip install accelerate hf_transfer 'modelscope!=1.15.0'
|
||||||
|
|
||||||
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
|
|
||||||
COPY vllm vllm
|
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
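For reference, a sketch of invoking this wheel-build pipeline from the repo root. BuildKit is required for the cache mounts; the image tag and job counts below are illustrative assumptions, not part of this diff, while the build args and the vllm-openai target come from the Dockerfile above:

    # hypothetical CUDA build with sccache enabled
    $ DOCKER_BUILDKIT=1 docker build . \
          --target vllm-openai \
          --build-arg USE_SCCACHE=1 \
          --build-arg max_jobs=8 \
          --build-arg nvcc_threads=8 \
          -t vllm-openai:dev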
Dockerfile.cpu (new file, 41 lines)
@@ -0,0 +1,41 @@
+# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-1
+
+RUN apt-get update -y \
+    && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g. holding memory in caches to speed up access of commonly-used objects.
+RUN pip install intel-openmp
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
+
+RUN pip install --upgrade pip \
+    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
+
+FROM cpu-test-1 AS build
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+
+# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
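A minimal sketch of building and serving from this CPU image; the tag, port, and model name are assumptions for illustration, while the entrypoint and the VLLM_CPU_DISABLE_AVX512 build arg come from the Dockerfile above:

    # build without AVX512 kernels, then serve via the OpenAI-compatible entrypoint
    $ docker build -f Dockerfile.cpu --build-arg VLLM_CPU_DISABLE_AVX512="true" -t vllm-cpu .
    $ docker run -p 8000:8000 vllm-cpu --model facebook/opt-125m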
Dockerfile.neuron (new file, 36 lines)
@@ -0,0 +1,36 @@
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+
+FROM $BASE_IMAGE
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+
+RUN cd /app/vllm \
+    && python3 -m pip install -U -r requirements-neuron.txt
+
+ENV VLLM_TARGET_DEVICE neuron
+RUN cd /app/vllm \
+    && pip install -e . \
+    && cd ..
+
+CMD ["/bin/bash"]
Dockerfile.openvino (new file, 29 lines)
@@ -0,0 +1,29 @@
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+FROM ubuntu:22.04 AS dev
+
+RUN apt-get update -y && \
+    apt-get install -y python3-pip git
+WORKDIR /workspace
+
+# copy requirements
+COPY requirements-build.txt /workspace/vllm/
+COPY requirements-common.txt /workspace/vllm/
+COPY requirements-openvino.txt /workspace/vllm/
+
+COPY vllm/ /workspace/vllm/vllm
+COPY csrc/core /workspace/vllm/csrc/core
+COPY cmake/utils.cmake /workspace/vllm/cmake/
+COPY CMakeLists.txt /workspace/vllm/
+COPY setup.py /workspace/vllm/
+
+# install build requirements
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+# build vLLM with OpenVINO backend
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+
+COPY examples/ /workspace/vllm/examples
+COPY benchmarks/ /workspace/vllm/benchmarks
+
+CMD ["/bin/bash"]
Dockerfile.ppc64le (new file, 22 lines)
@@ -0,0 +1,22 @@
+FROM mambaorg/micromamba
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+USER root
+
+RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# Some packages in requirements-cpu are installed here
+# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
+# Currently these may not be available for venv or pip directly
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+# These packages will be in rocketce eventually
+RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+WORKDIR /vllm-workspace
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
Dockerfile.rocm (204 lines)
@@ -1,32 +1,33 @@
-# default base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+# Default ROCm 6.1 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"

-FROM $BASE_IMAGE
-
-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
-RUN echo "Base image is $BASE_IMAGE"
-
-# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
-# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
-ARG FA_GFX_ARCHS="gfx90a;gfx942"
-RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
-
-ARG FA_BRANCH="3d2b6f5"
-RUN echo "FA_BRANCH is $FA_BRANCH"
-
-# whether to build flash-attention
-# if 0, will not build flash attention
-# this is useful for gfx target where flash-attention is not supported
-# In that case, we need to use the python reference attention implementation in vllm
+# Default ROCm ARCHes to build vLLM for.
+ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
+
+# Whether to install CK-based flash-attention
+# If 0, will not install flash-attention
 ARG BUILD_FA="1"
+# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
+# If this succeeds, we use the downloaded wheel and skip building flash-attention.
+# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
+# architectures specified in `FA_GFX_ARCHS`
+ARG TRY_FA_WHEEL="1"
+ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
+ARG FA_GFX_ARCHS="gfx90a;gfx942"
+ARG FA_BRANCH="23a2b1c2"
+
+# Whether to build triton on rocm
+ARG BUILD_TRITON="1"
+ARG TRITON_BRANCH="e0fc12c"
+
+### Base image build stage
+FROM $BASE_IMAGE AS base
+
+# Import arg(s) defined before this build stage
+ARG PYTORCH_ROCM_ARCH
+
 # Install some basic utilities
 RUN apt-get update && apt-get install python3 python3-pip -y
-
-# Install some basic utilities
 RUN apt-get update && apt-get install -y \
     curl \
     ca-certificates \
@@ -37,59 +38,144 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     wget \
     unzip \
-    nvidia-cuda-toolkit \
     tmux \
+    ccache \
     && rm -rf /var/lib/apt/lists/*

-### Mount Point ###
-# When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
-VOLUME [ ${APP_MOUNT} ]
+# When launching the container, mount the code directory to /vllm-workspace
+ARG APP_MOUNT=/vllm-workspace
 WORKDIR ${APP_MOUNT}

 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+# Remove sccache so it doesn't interfere with ccache
+# TODO: implement sccache support across components
+RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+# Install torch == 2.5.0 on ROCm
+RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.1"*) \
+            python3 -m pip uninstall -y torch torchvision \
+            && python3 -m pip install --no-cache-dir --pre \
+                torch==2.5.0.dev20240726 \
+                torchvision==0.20.0.dev20240726 \
+               --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+        *) ;; esac

 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
 ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
 ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:

-# Install ROCm flash-attention
-RUN if [ "$BUILD_FA" = "1" ]; then \
-    mkdir libs \
-    && cd libs \
-    && git clone https://github.com/ROCm/flash-attention.git \
-    && cd flash-attention \
-    && git checkout ${FA_BRANCH} \
-    && git submodule update --init \
-    && export GPU_ARCHS=${FA_GFX_ARCHS} \
-    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
-        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
-    && python3 setup.py install \
-    && cd ..; \
-    fi
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+ENV CCACHE_DIR=/root/.cache/ccache
+
+### AMD-SMI build stage
+FROM base AS build_amdsmi
+# Build amdsmi wheel always
+RUN cd /opt/rocm/share/amd_smi \
+    && python3 -m pip wheel . --wheel-dir=/install
+
+### Flash-Attention wheel build stage
+FROM base AS build_fa
+ARG BUILD_FA
+ARG TRY_FA_WHEEL
+ARG FA_WHEEL_URL
+ARG FA_GFX_ARCHS
+ARG FA_BRANCH
+# Build ROCm flash-attention wheel if `BUILD_FA = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_FA" = "1" ]; then \
+        if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
+            # If a suitable wheel exists, we download it instead of building FA
+            mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
+        else \
+            mkdir -p libs \
+            && cd libs \
+            && git clone https://github.com/ROCm/flash-attention.git \
+            && cd flash-attention \
+            && git checkout "${FA_BRANCH}" \
+            && git submodule update --init \
+            && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
+        fi; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
+    fi

-# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
-# Manually removed it so that later steps of numpy upgrade can continue
-RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
-    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

-COPY ./ /app/vllm
+### Triton wheel build stage
+FROM base AS build_triton
+ARG BUILD_TRITON
+ARG TRITON_BRANCH
+# Build triton wheel if `BUILD_TRITON = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_TRITON" = "1" ]; then \
+        mkdir -p libs \
+        && cd libs \
+        && git clone https://github.com/OpenAI/triton.git \
+        && cd triton \
+        && git checkout "${TRITON_BRANCH}" \
+        && cd python \
+        && python3 setup.py bdist_wheel --dist-dir=/install; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
+    fi

-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install xformers==0.0.23 --no-deps

-RUN cd /app \
-    && cd vllm \
-    && pip install -U -r requirements-rocm.txt \
-    && if [ "$BUILD_FA" = "1" ]; then \
-       bash patch_xformers.rocm.sh; fi \
-    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
-    && python3 setup.py install \
-    && cd ..
+### Final vLLM build stage
+FROM base AS final
+# Import the vLLM development directory from the build context
+COPY . .

-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
+# Package upgrades for useful functionality or to avoid dependency issues
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]

+# Workaround for ray >= 2.10.0
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+# Silences the HF Tokenizers warning
+ENV TOKENIZERS_PARALLELISM=false

+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -Ur requirements-rocm.txt \
+    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.1"*) \
+            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
+            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
+            # Prevent interference if torch bundles its own HIP runtime
+            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
+        *) ;; esac \
+    && python3 setup.py clean --all \
+    && python3 setup.py develop

+# Copy amdsmi wheel into final image
+RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
+    mkdir -p libs \
+    && cp /install/*.whl libs \
+    # Preemptively uninstall to avoid same-version no-installs
+    && python3 -m pip uninstall -y amdsmi;

+# Copy triton wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && python3 -m pip uninstall -y triton; fi

+# Copy flash-attn wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && python3 -m pip uninstall -y flash-attn; fi

+# Install wheels that were built to the final image
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if ls libs/*.whl; then \
+        python3 -m pip install libs/*.whl; fi

 CMD ["/bin/bash"]
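A sketch of how the ROCm build args above might be combined; the tag and arch choices are illustrative, while the args themselves are defined in the Dockerfile:

    # hypothetical build targeting gfx90a only, building flash-attention from source
    $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm . \
          --build-arg PYTORCH_ROCM_ARCH="gfx90a" \
          --build-arg FA_GFX_ARCHS="gfx90a" \
          --build-arg TRY_FA_WHEEL="0" \
          -t vllm-rocm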
Dockerfile.tpu (new file, 23 lines)
@@ -0,0 +1,23 @@
+ARG NIGHTLY_DATE="20240726"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
+
+FROM $BASE_IMAGE
+WORKDIR /workspace
+
+# Install aiohttp separately to avoid build errors.
+RUN pip install aiohttp
+# Install NumPy 1 instead of NumPy 2.
+RUN pip install "numpy<2"
+# Install the TPU and Pallas dependencies.
+RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+
+# Fix FastAPI dependence
+RUN pip install "starlette<0.38.0"
+
+# Build vLLM.
+COPY . /workspace/vllm
+ENV VLLM_TARGET_DEVICE="tpu"
+RUN cd /workspace/vllm && python setup.py develop
+
+CMD ["/bin/bash"]
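Because the base image is parameterized on NIGHTLY_DATE, a different torch_xla nightly can be selected at build time; the date and tag below are placeholders, assuming such a nightly exists:

    $ docker build -f Dockerfile.tpu --build-arg NIGHTLY_DATE="20240801" -t vllm-tpu .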
Dockerfile.xpu (new file, 22 lines)
@@ -0,0 +1,22 @@
+FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
+RUN apt-get update -y \
+    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-xpu.txt
+
+RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+
+CMD ["/bin/bash"]
MANIFEST.in
@@ -1,5 +1,10 @@
 include LICENSE
-include requirements.txt
+include requirements-adag.txt
+include requirements-common.txt
+include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
 include CMakeLists.txt

 recursive-include cmake *
README.md (101 lines)
@@ -16,25 +16,14 @@ Easy, fast, and cheap LLM serving for everyone

 ---

-**The Third vLLM Bay Area Meetup (April 2nd 6pm-8:30pm PT)**
-
-We are thrilled to announce our third vLLM Meetup!
-The vLLM team will share recent updates and roadmap.
-We will also have vLLM collaborators from Roblox coming up to the stage to discuss their experience in deploying LLMs with vLLM.
-Please register [here](https://robloxandvllmmeetup2024.splashthat.com/) and join us!
-
----

 *Latest News* 🔥
-- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
-- [2024/01] Added ROCm 6.0 support to vLLM.
-- [2023/12] Added ROCm 5.7 support to vLLM.
-- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
-- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
-- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
-- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
-- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
+- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
+- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
+- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
+- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
+- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

 ---
@@ -50,50 +39,27 @@ vLLM is fast with:
 - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
 - Optimized CUDA kernels

+**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+
 vLLM is flexible and easy to use with:

 - Seamless integration with popular Hugging Face models
 - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor parallelism support for distributed inference
+- Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs and AMD GPUs
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
 - (Experimental) Prefix caching support
 - (Experimental) Multi-lora support

-vLLM seamlessly supports many Hugging Face models, including the following architectures:
-
-- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
-- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
-- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
-- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
-- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
-- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
-- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
-- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
-- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
-- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
-- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
-- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
-- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
-- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
-- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
-- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
-- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
-- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
-- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
-- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
-- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
-- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
-- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
-- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
-- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
-- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
-- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
-- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
-- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
-- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+- Transformer-like LLMs (e.g., Llama)
+- Mixture-of-Expert LLMs (e.g., Mixtral)
+- Multi-modal LLMs (e.g., LLaVA)
+
+Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## Getting Started

 Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

@@ -101,9 +67,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
 pip install vllm
 ```

-## Getting Started
-
-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
 - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
 - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
 - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@@ -113,6 +77,35 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
 We welcome and value any contributions and collaborations.
 Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

+## Sponsors
+
+vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+
+<!-- Note: Please sort them in alphabetical order. -->
+<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
+
+- a16z
+- AMD
+- Anyscale
+- AWS
+- Crusoe Cloud
+- Databricks
+- DeepInfra
+- Dropbox
+- Google Cloud
+- Lambda Lab
+- NVIDIA
+- Replicate
+- Roblox
+- RunPod
+- Sequoia Capital
+- Trainy
+- UC Berkeley
+- UC San Diego
+- ZhenFund
+
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

 ## Citation

 If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
benchmarks/backend_request_func.py
@@ -4,10 +4,13 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union

 import aiohttp
+import huggingface_hub.constants
 from tqdm.asyncio import tqdm
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)

 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

@@ -27,8 +30,8 @@ class RequestFuncInput:
 class RequestFuncOutput:
     generated_text: str = ""
     success: bool = False
-    latency: float = 0
-    ttft: float = 0  # Time to first token
+    latency: float = 0.0
+    ttft: float = 0.0  # Time to first token
     itl: List[float] = field(
         default_factory=list)  # List of inter-token latencies
     prompt_len: int = 0
@@ -58,23 +61,28 @@ async def async_request_tgi(
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len

-    ttft = 0
+    ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
     try:
         async with session.post(url=api_url, json=payload) as response:
             if response.status == 200:
-                async for chunk in response.content:
-                    chunk = chunk.strip()
-                    if not chunk:
+                async for chunk_bytes in response.content:
+                    chunk_bytes = chunk_bytes.strip()
+                    if not chunk_bytes:
                         continue
+                    chunk_bytes = chunk_bytes.decode("utf-8")

-                    chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+                    #NOTE: Sometimes TGI returns a ping response without
+                    # any data, we should skip it.
+                    if chunk_bytes.startswith(":"):
+                        continue
+                    chunk = remove_prefix(chunk_bytes, "data:")

                     data = json.loads(chunk)
                     timestamp = time.perf_counter()
                     # First token
-                    if ttft == 0:
+                    if ttft == 0.0:
                         ttft = time.perf_counter() - st
                         output.ttft = ttft

@@ -88,6 +96,9 @@ async def async_request_tgi(
                 output.latency = most_recent_timestamp - st
                 output.success = True
                 output.generated_text = data["generated_text"]
+            else:
+                output.error = response.reason or ""
+                output.success = False
     except Exception:
         output.success = False
         exc_info = sys.exc_info()
@@ -119,23 +130,25 @@ async def async_request_trt_llm(
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len

-    ttft = 0
+    ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
     try:
         async with session.post(url=api_url, json=payload) as response:
             if response.status == 200:
-                async for chunk in response.content:
-                    chunk = chunk.strip()
-                    if not chunk:
+                async for chunk_bytes in response.content:
+                    chunk_bytes = chunk_bytes.strip()
+                    if not chunk_bytes:
                         continue

-                    chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+                    chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                          "data:")

                     data = json.loads(chunk)
+                    output.generated_text += data["text_output"]
                     timestamp = time.perf_counter()
                     # First token
-                    if ttft == 0:
+                    if ttft == 0.0:
                         ttft = time.perf_counter() - st
                         output.ttft = ttft

@@ -147,11 +160,10 @@ async def async_request_trt_llm(
                     most_recent_timestamp = timestamp

                 output.latency = most_recent_timestamp - st
-                output.generated_text = json.loads(data)["text_output"]
                 output.success = True
-
             else:
-                output.error = response.reason
+                output.error = response.reason or ""
                 output.success = False
     except Exception:
         output.success = False
@@ -195,7 +207,7 @@ async def async_request_deepspeed_mii(
             output.generated_text = parsed_resp["text"][0]
             output.success = True
         else:
-            output.error = response.reason
+            output.error = response.reason or ""
             output.success = False
     except Exception:
         output.success = False
@@ -213,8 +225,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        "v1/completions"
-    ), "OpenAI Completions API URL must end with 'v1/completions'."
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."

     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert not request_func_input.use_beam_search
@@ -234,38 +246,38 @@ async def async_request_openai_completions(
         output.prompt_len = request_func_input.prompt_len

         generated_text = ""
-        ttft = 0
+        ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
             async with session.post(url=api_url, json=payload,
                                     headers=headers) as response:
                 if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                             continue

-                        chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
                         if chunk == "[DONE]":
                             latency = time.perf_counter() - st
                         else:
                             data = json.loads(chunk)

+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
                             if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
-                                if ttft == 0:
+                                if ttft == 0.0:
                                     ttft = time.perf_counter() - st
                                     output.ttft = ttft

                                 # Decoding phase
-                                # NOTE: Some completion API might have a last
-                                # usage summary response without a token so we
-                                # do not want to include as inter-token-latency
-                                elif data.get("usage", None) is None:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)

                                 most_recent_timestamp = timestamp
                                 generated_text += data["choices"][0]["text"]
@@ -273,6 +285,9 @@ async def async_request_openai_completions(
                     output.generated_text = generated_text
                     output.success = True
                     output.latency = latency
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
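The completions parser above consumes OpenAI-style server-sent events; for reference, the stream it expects looks roughly like the following (host, port, and model name are assumptions for illustration):

    # hypothetical streaming request against a local OpenAI-compatible server
    $ curl http://localhost:8000/v1/completions \
          -H "Content-Type: application/json" \
          -d '{"model": "facebook/opt-125m", "prompt": "Hello", "stream": true}'
    # each chunk arrives as a line of the form `data: {...}`; the stream ends with `data: [DONE]`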
@@ -289,8 +304,8 @@ async def async_request_openai_chat_completions(
|
|||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(
|
||||||
"v1/chat/completions"
|
"chat/completions"
|
||||||
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
|
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
assert not request_func_input.use_beam_search
|
assert not request_func_input.use_beam_search
|
||||||
@@ -315,28 +330,30 @@ async def async_request_openai_chat_completions(
|
|||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
generated_text = ""
|
generated_text = ""
|
||||||
ttft = 0
|
ttft = 0.0
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
async with session.post(url=api_url, json=payload,
|
async with session.post(url=api_url, json=payload,
|
||||||
headers=headers) as response:
|
headers=headers) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
async for chunk in response.content:
|
async for chunk_bytes in response.content:
|
||||||
chunk = chunk.strip()
|
chunk_bytes = chunk_bytes.strip()
|
||||||
if not chunk:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
|
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
||||||
|
"data: ")
|
||||||
if chunk == "[DONE]":
|
if chunk == "[DONE]":
|
||||||
latency = time.perf_counter() - st
|
latency = time.perf_counter() - st
|
||||||
else:
|
else:
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
if "content" in data["choices"][0]["delta"]:
|
delta = data["choices"][0]["delta"]
|
||||||
|
if delta.get("content", None):
|
||||||
# First token
|
# First token
|
||||||
if ttft == 0:
|
if ttft == 0.0:
|
||||||
ttft = time.perf_counter() - st
|
ttft = time.perf_counter() - st
|
||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
@@ -345,8 +362,7 @@ async def async_request_openai_chat_completions(
|
|||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp -
|
||||||
most_recent_timestamp)
|
most_recent_timestamp)
|
||||||
|
|
||||||
generated_text += data["choices"][0]["delta"][
|
generated_text += delta["content"]
|
||||||
"content"]
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@@ -354,7 +370,7 @@ async def async_request_openai_chat_completions(
|
|||||||
output.success = True
|
output.success = True
|
||||||
output.latency = latency
|
output.latency = latency
|
||||||
else:
|
else:
|
||||||
output.error = response.reason
|
output.error = response.reason or ""
|
||||||
output.success = False
|
output.success = False
|
||||||
except Exception:
|
except Exception:
|
||||||
output.success = False
|
output.success = False
|
||||||
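Both request functions frame their streams the same way: strip the `data: ` prefix from each server-sent-events line and stop at the `[DONE]` sentinel. A small sketch of just that framing, assuming raw byte lines as input (the `parse_sse_payloads` helper is illustrative, not part of the diff):

```python
import json
from typing import Iterable, List

def parse_sse_payloads(lines: Iterable[bytes]) -> List[dict]:
    """Decode OpenAI-style SSE lines ("data: {...}") into JSON payloads,
    skipping keep-alive blanks and stopping at the "[DONE]" sentinel."""
    payloads: List[dict] = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        text = line.decode("utf-8")
        prefix = "data: "
        if text.startswith(prefix):
            text = text[len(prefix):]  # equivalent to remove_prefix()
        if text == "[DONE]":
            break
        payloads.append(json.loads(text))
    return payloads

# Example: two streamed chunks followed by the DONE sentinel.
sample = [b'data: {"choices": [{"text": "Hel"}]}', b'',
          b'data: {"choices": [{"text": "lo"}]}', b'data: [DONE]']
assert [p["choices"][0]["text"]
        for p in parse_sse_payloads(sample)] == ["Hel", "lo"]
```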
@@ -374,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text


+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
+        from modelscope import snapshot_download
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+        return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+            pretrained_model_name_or_path):
+        pretrained_model_name_or_path = get_model(
+            pretrained_model_name_or_path)
+    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
+                                         trust_remote_code=trust_remote_code)
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "vllm": async_request_openai_completions,
@@ -382,4 +422,5 @@ ASYNC_REQUEST_FUNCS = {
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
 }
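The `ASYNC_REQUEST_FUNCS` table is the whole dispatch mechanism: the serving benchmark looks up the `--backend` name and awaits the returned coroutine. A sketch of how a caller would exercise one entry directly; the URL and model are placeholders, and the `RequestFuncInput` fields are the ones used elsewhere in this diff:

```python
import asyncio
from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput

async def send_one(backend: str) -> None:
    request_func = ASYNC_REQUEST_FUNCS[backend]  # KeyError: unknown backend
    output = await request_func(request_func_input=RequestFuncInput(
        model="facebook/opt-125m",                       # placeholder
        prompt="Hello",
        api_url="http://localhost:8000/v1/completions",  # placeholder
        prompt_len=1,
        output_len=16,
        best_of=1,
        use_beam_search=False,
    ))
    print(output.success, output.ttft, repr(output.generated_text))

asyncio.run(send_one("openai"))
```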
@@ -1,14 +1,19 @@
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
+import json
 import time
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional

 import numpy as np
 import torch
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptInputs
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser


 def main(args: argparse.Namespace):
@@ -16,19 +21,33 @@ def main(args: argparse.Namespace):

     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        speculative_draft_tensor_parallel_size=\
+        args.speculative_draft_tensor_parallel_size,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        max_model_len=args.max_model_len,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        load_format=args.load_format,
+        distributed_executor_backend=args.distributed_executor_backend,
+        otlp_traces_endpoint=args.otlp_traces_endpoint,
+        enable_prefix_caching=args.enable_prefix_caching,
+    )

     sampling_params = SamplingParams(
         n=args.n,
@@ -42,7 +61,9 @@ def main(args: argparse.Namespace):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
+    dummy_inputs: List[PromptInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
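The `dummy_inputs` change above switches the latency benchmark from a bare list of token-id lists to typed `PromptInputs` dicts. A standalone sketch of the same construction (batch size and input length are illustrative):

```python
import numpy as np

batch_size, input_len = 8, 32
dummy_prompt_token_ids = np.random.randint(10000,
                                           size=(batch_size, input_len))

# Each prompt becomes {"prompt_token_ids": [...]} so generate() receives
# pre-tokenized inputs instead of raw strings.
dummy_inputs = [{
    "prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]

assert len(dummy_inputs) == batch_size
assert len(dummy_inputs[0]["prompt_token_ids"]) == input_len
```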
@@ -53,13 +74,13 @@ def main(args: argparse.Namespace):
                     ],
                     on_trace_ready=torch.profiler.tensorboard_trace_handler(
                         str(profile_dir))) as p:
-                llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                llm.generate(dummy_inputs,
                              sampling_params=sampling_params,
                              use_tqdm=False)
             print(p.key_averages())
         else:
             start_time = time.perf_counter()
-            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+            llm.generate(dummy_inputs,
                          sampling_params=sampling_params,
                          use_tqdm=False)
             end_time = time.perf_counter()
@@ -67,7 +88,8 @@ def main(args: argparse.Namespace):
         return latency

     print("Warming up...")
-    run_to_completion(profile_dir=None)
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion(profile_dir=None)

     if args.profile:
         profile_dir = args.profile_result_dir
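Replacing the single warmup call with `--num-iters-warmup` iterations follows the usual measurement rule: let CUDA graph capture, allocator pools, and caches reach steady state before timing. A generic sketch of the warmup-then-measure harness, independent of vLLM:

```python
import time
from typing import Callable, List

def bench(fn: Callable[[], None],
          num_warmup: int = 10,
          num_iters: int = 30) -> List[float]:
    """Warm up untimed, then return per-iteration latencies in seconds."""
    for _ in range(num_warmup):
        fn()  # let graph capture, allocators, and caches settle
    latencies: List[float] = []
    for _ in range(num_iters):
        start = time.perf_counter()
        fn()
        latencies.append(time.perf_counter() - start)
    return latencies

lat = bench(lambda: sum(i * i for i in range(100_000)))  # toy workload
print(f"min={min(lat):.4f}s mean={sum(lat) / len(lat):.4f}s")
```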
@@ -83,18 +105,39 @@ def main(args: argparse.Namespace):
     latencies = []
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
         latencies.append(run_to_completion(profile_dir=None))
+    latencies = np.array(latencies)
+    percentages = [10, 25, 50, 75, 90, 99]
+    percentiles = np.percentile(latencies, percentages)
     print(f'Avg latency: {np.mean(latencies)} seconds')
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f'{percentage}% percentile latency: {percentile} seconds')
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the latency of processing a single batch of '
         'requests till completion.')
     parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--speculative-model', type=str, default=None)
+    parser.add_argument('--num-speculative-tokens', type=int, default=None)
+    parser.add_argument('--speculative-draft-tensor-parallel-size',
+                        '-spec-draft-tp',
+                        type=int,
+                        default=None)
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                         default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
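With the new `--output-json` flag, the latency run serializes its summary to a flat JSON file. A sketch of how a postprocessing script might read it back (the file name is illustrative; note that JSON round-tripping turns the integer percentage keys into strings):

```python
import json

# Illustrative: read back a file produced with --output-json results.json
with open("results.json") as f:
    results = json.load(f)

print(f"avg latency: {results['avg_latency']:.3f} s")
# json.dump turned the integer percentages into string keys:
for pct, latency in results["percentiles"].items():
    print(f"p{pct}: {latency:.3f} s")
```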
@@ -105,13 +148,23 @@ if __name__ == '__main__':
                         default=1,
                         help='Number of generated sequences per prompt.')
     parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
     parser.add_argument('--num-iters',
                         type=int,
-                        default=3,
+                        default=30,
                         help='Number of iterations to run.')
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
     parser.add_argument(
         '--dtype',
         type=str,
@@ -125,12 +178,23 @@ if __name__ == '__main__':
         action='store_true',
         help='enforce eager mode and disable CUDA graph')
     parser.add_argument(
-        "--kv-cache-dtype",
+        '--kv-cache-dtype',
         type=str,
-        choices=['auto', 'fp8_e5m2'],
-        default='auto',
-        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
     parser.add_argument(
         '--profile',
         action='store_true',
@@ -144,19 +208,23 @@ if __name__ == '__main__':
     parser.add_argument(
         "--device",
         type=str,
-        default="cuda",
-        choices=["cuda"],
-        help='device type for vLLM execution, supporting CUDA only currently.')
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
     parser.add_argument('--block-size',
                         type=int,
                         default=16,
                         help='block size of key/value cache')
     parser.add_argument(
         '--enable-chunked-prefill',
-        type=bool,
-        default=False,
+        action='store_true',
         help='If True, the prefill requests can be chunked based on the '
         'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
+    parser.add_argument('--use-v2-block-manager', action='store_true')
     parser.add_argument(
         "--ray-workers-use-nsight",
         action='store_true',
@@ -167,5 +235,51 @@ if __name__ == '__main__':
         default=None,
         help='directory to download and load the weights, '
         'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--otlp-traces-endpoint',
+        type=str,
+        default=None,
+        help='Target URL to which OpenTelemetry traces will be sent.')
     args = parser.parse_args()
     main(args)
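Several scripts in this compare swap `argparse.ArgumentParser` for vLLM's `FlexibleArgumentParser`. Judging by its drop-in use here, a plausible reading is that it is a normal ArgumentParser that additionally accepts underscore spellings of dashed flags. The subclass below is a sketch of that idea under that assumption, not vLLM's actual implementation:

```python
import argparse
import sys
from typing import List, Optional

class DashNormalizingParser(argparse.ArgumentParser):
    """Treat '--some_flag' and '--some-flag' interchangeably (an
    illustration of what FlexibleArgumentParser appears to provide)."""

    def parse_args(self, args: Optional[List[str]] = None, namespace=None):
        if args is None:
            args = sys.argv[1:]
        normalized = []
        for arg in args:
            if arg.startswith('--'):
                key, sep, value = arg.partition('=')
                arg = key.replace('_', '-') + sep + value
            normalized.append(arg)
        return super().parse_args(normalized, namespace)

parser = DashNormalizingParser()
parser.add_argument('--max-model-len', type=int)
print(parser.parse_args(['--max_model_len=2048']).max_model_len)  # 2048
```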
@@ -1,7 +1,7 @@
-import argparse
 import time

 from vllm import LLM, SamplingParams
+from vllm.utils import FlexibleArgumentParser

 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501

@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):


 def main(args):
-    llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
+    llm = LLM(model=args.model,
               tokenizer_mode='auto',
               trust_remote_code=True,
               enforce_eager=True,
+              use_v2_block_manager=args.use_v2_block_manager,
+              tensor_parallel_size=args.tensor_parallel_size,
               enable_prefix_caching=args.enable_prefix_caching)

     num_prompts = 100
     prompts = [PROMPT] * num_prompts
-    sampling_params = SamplingParams(temperature=0, max_tokens=100)
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

     print("------warm up------")
     test_prefix(
         llm=llm,
-        prompts=prompts[:1],
+        prompts=prompts,
         sampling_params=sampling_params,
     )

@@ -42,11 +44,19 @@ def main(args):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description='Benchmark the performance with or without automatic '
         'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
     parser.add_argument('--enable-prefix-caching',
                         action='store_true',
                         help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceMangerV2')
     args = parser.parse_args()
     main(args)
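The prompts in this script all share one long identical prefix (the markdown table), which is exactly the workload where automatic prefix caching helps. A self-contained sketch of the comparison the script automates; model, prefix, and lengths are illustrative, and the two configurations are best run in separate processes:

```python
# Flip ENABLE and rerun in a fresh process to compare timings.
import time
from vllm import LLM, SamplingParams

ENABLE = True
prefix = "word " * 500                      # long shared prefix
prompts = [f"{prefix}Question {i}?" for i in range(100)]
sampling_params = SamplingParams(temperature=0, max_tokens=10)

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=ENABLE)
llm.generate(prompts[:1], sampling_params)  # warm up / populate the cache
start = time.perf_counter()
llm.generate(prompts, sampling_params)
print(f"enable_prefix_caching={ENABLE}: "
      f"{time.perf_counter() - start:.2f}s")
```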
@@ -2,8 +2,8 @@

 On the server side, run one of the following commands:
     vLLM OpenAI API server
-    python -m vllm.entrypoints.openai.api_server \
-        --model <your_model> --swap-space 16 \
+    vllm serve <your_model> \
+        --swap-space 16 \
         --disable-log-requests

     (TGI backend)
@@ -17,6 +17,10 @@ On the client side, run:
         --dataset-path <path to dataset> \
         --request-rate <request_rate> \ # By default <request_rate> is inf
         --num-prompts <num_prompts> # By default <num_prompts> is 1000
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+    to the end of the command above.
 """
 import argparse
 import asyncio
@@ -27,7 +31,7 @@
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -35,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

-from vllm.transformers_utils.tokenizer import get_tokenizer
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser


 @dataclass
@@ -48,17 +60,26 @@ class BenchmarkMetrics:
     output_throughput: float
     mean_ttft_ms: float
     median_ttft_ms: float
+    std_ttft_ms: float
     p99_ttft_ms: float
     mean_tpot_ms: float
     median_tpot_ms: float
+    std_tpot_ms: float
     p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    p99_itl_ms: float


 def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
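The new `std_*` and `p99_*` fields are all produced by the same four-statistic summary, with the `x or 0` guard used later in this diff protecting against empty sample lists (for example a backend that never reports TTFTs). A standalone sketch:

```python
from typing import List

import numpy as np

def summarize_ms(samples_s: List[float]) -> dict:
    """mean/median/std/p99 in ms, with the `x or 0` guard from the diff:
    an empty list degrades to the scalar 0 instead of crashing numpy."""
    return {
        "mean_ms": np.mean(samples_s or 0) * 1000,
        "median_ms": np.median(samples_s or 0) * 1000,
        "std_ms": np.std(samples_s or 0) * 1000,
        "p99_ms": np.percentile(samples_s or 0, 99) * 1000,
    }

print(summarize_ms([0.012, 0.015, 0.051]))  # normal case
print(summarize_ms([]))                     # all zeros, no numpy error
```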
@@ -68,38 +89,32 @@ def sample_sharegpt_requests(
     dataset = [(data["conversations"][0]["value"],
                 data["conversations"][1]["value"]) for data in dataset]

-    # some of these will be filtered out, so sample more than we need
-    sampled_indices = random.sample(range(len(dataset)),
-                                    int(num_requests * 1.2))
-    dataset = [dataset[i] for i in sampled_indices]
+    # Shuffle the dataset.
+    random.shuffle(dataset)

-    # Tokenize the prompts and completions.
-    prompts = [prompt for prompt, _ in dataset]
-    prompt_token_ids = tokenizer(prompts).input_ids
-    completions = [completion for _, completion in dataset]
-    completion_token_ids = tokenizer(completions).input_ids
-    tokenized_dataset = []
-    for i in range(len(dataset)):
-        output_len = len(completion_token_ids[i])
-        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
-
-    # Filter out too long sequences.
+    # Filter out sequences that are too long or too short
     filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
-            # This is because TGI causes errors when the input or output length
-            # is too short.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))

-    # Sample the requests.
-    sampled_requests = random.sample(filtered_dataset, num_requests)
-    return sampled_requests
+    return filtered_dataset


 def sample_sonnet_requests(
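The rewritten sampler shuffles once and walks the dataset until it has `num_requests` survivors, where the old code sampled `1.2 * num_requests` up front and could still come up short after filtering. The same pattern in miniature:

```python
import random

def take_filtered(items, num, keep):
    """Shuffle, then walk until `num` items satisfy `keep`. Unlike
    sampling 1.2x up front and filtering, this cannot come up short
    while valid items remain."""
    items = list(items)
    random.shuffle(items)
    out = []
    for item in items:
        if len(out) == num:
            break
        if keep(item):
            out.append(item)
    return out

random.seed(0)
# Toy data: keep only even numbers; asking for 5 always succeeds here.
print(take_filtered(range(100), 5, lambda x: x % 2 == 0))
```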
@@ -110,7 +125,9 @@ def sample_sonnet_requests(
     prefix_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, str, int, int]]:
-    assert input_len > prefix_len, "input_len must be greater than prefix_len."
+    assert (
+        input_len > prefix_len
+    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."

     # Load the dataset.
     with open(dataset_path) as f:
@@ -131,8 +148,9 @@ def sample_sonnet_requests(
         base_message, add_generation_prompt=True, tokenize=False)
     base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)

-    assert (input_len > base_prompt_offset
-            ), f"Please set 'args.input-len' higher than {base_prompt_offset}."
+    assert (
+        input_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
     num_input_lines = round(
         (input_len - base_prompt_offset) / average_poem_len)

@@ -140,7 +158,7 @@ def sample_sonnet_requests(
     # prompt are fixed poem lines.
     assert (
         prefix_len > base_prompt_offset
-    ), f"Please set 'args.prefix-len' higher than {base_prompt_offset}."
+    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."

     num_prefix_lines = round(
         (prefix_len - base_prompt_offset) / average_poem_len)
@@ -169,6 +187,31 @@ def sample_sonnet_requests(
     return sampled_requests


+def sample_random_requests(
+        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
+        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+                                   for j in range(input_lens[i])])
+        input_requests.append(
+            (prompt, int(input_lens[i]), int(output_lens[i])))
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
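`get_request` (continued in the next hunk) spaces dispatches with exponentially distributed gaps, which makes request arrivals a Poisson process at the requested rate. A runnable sketch of that arrival generator:

```python
import asyncio
import time

import numpy as np

async def poisson_arrivals(items, request_rate: float):
    """Yield items with Exp(1/rate) gaps: a Poisson process at
    `request_rate` requests/second, as in get_request. A rate of
    float('inf') degenerates to back-to-back dispatch."""
    for item in items:
        yield item
        if request_rate == float("inf"):
            continue
        await asyncio.sleep(np.random.exponential(1.0 / request_rate))

async def demo():
    start = time.perf_counter()
    async for i in poisson_arrivals(range(5), request_rate=10.0):
        print(f"t={time.perf_counter() - start:.2f}s -> request {i}")

asyncio.run(demo())
```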
@@ -180,6 +223,7 @@ async def get_request(
         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue
+
         # Sample the request interval from the exponential distribution.
         interval = np.random.exponential(1.0 / request_rate)
         # The next request will be sent after the interval.
@@ -192,24 +236,37 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
     total_input = 0
     completed = 0
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together
+            # Note : this may inflate the output token count slightly
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
                          add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append(
                     (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             completed += 1
         else:
             actual_output_lens.append(0)

+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2)
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
@@ -220,10 +277,16 @@ def calculate_metrics(
         mean_ttft_ms=np.mean(ttfts or 0) *
         1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
-        mean_tpot_ms=np.mean(tpots) * 1000,
-        median_tpot_ms=np.median(tpots) * 1000,
-        p99_tpot_ms=np.percentile(tpots, 99) * 1000,
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )

     return metrics, actual_output_lens
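The TPOT definition used here is worth pinning down with numbers: total latency minus TTFT, averaged over the `output_len - 1` tokens that follow the first.

```python
# Worked example of the TPOT formula (numbers are illustrative).
latency = 2.10    # total request latency, seconds
ttft = 0.30       # time to first token, seconds
output_len = 91   # tokens counted from the generated text

# Tokens 2..91 arrive during the (latency - ttft) decode window,
# i.e. 90 steps in 1.8 s:
tpot = (latency - ttft) / (output_len - 1)
print(f"TPOT = {tpot * 1000:.1f} ms/token")  # 20.0 ms/token
```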
@@ -241,16 +304,34 @@ async def benchmark(
     disable_tqdm: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
     else:
         raise ValueError(f"Unknown backend: {backend}")

+    print("Starting initial single prompt test run...")
+    test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_prompt,
+        api_url=api_url,
+        prompt_len=test_prompt_len,
+        output_len=test_output_len,
+        best_of=best_of,
+        use_beam_search=use_beam_search,
+    )
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}")
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
     print(f"Traffic request rate: {request_rate}")

     pbar = None if disable_tqdm else tqdm(total=len(input_requests))

     benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
         request_func_input = RequestFuncInput(
@@ -268,7 +349,7 @@ async def benchmark(
                     pbar=pbar)))
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

-    if not disable_tqdm:
+    if pbar is not None:
         pbar.close()

     benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -305,6 +386,10 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                     metrics.median_tpot_ms))
     print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
     print("=" * 50)

     result = {
@@ -317,10 +402,16 @@ async def benchmark(
         "output_throughput": metrics.output_throughput,
         "mean_ttft_ms": metrics.mean_ttft_ms,
         "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
         "p99_ttft_ms": metrics.p99_ttft_ms,
         "mean_tpot_ms": metrics.mean_tpot_ms,
         "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
         "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
@@ -358,6 +449,7 @@ def main(args: argparse.Namespace):
             dataset_path=args.dataset,
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
         )

     elif args.dataset_name == "sharegpt":
@@ -365,6 +457,7 @@ def main(args: argparse.Namespace):
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
         )

     elif args.dataset_name == "sonnet":
@@ -373,9 +466,9 @@ def main(args: argparse.Namespace):
         input_requests = sample_sonnet_requests(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
-            input_len=args.input_len,
-            output_len=args.output_len,
-            prefix_len=args.prefix_len,
+            input_len=args.sonnet_input_len,
+            output_len=args.sonnet_output_len,
+            prefix_len=args.sonnet_prefix_len,
             tokenizer=tokenizer,
         )
         input_requests = [(prompt, prompt_len, output_len)
@@ -388,15 +481,24 @@ def main(args: argparse.Namespace):
         input_requests = sample_sonnet_requests(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
-            input_len=args.input_len,
-            output_len=args.output_len,
-            prefix_len=args.prefix_len,
+            input_len=args.sonnet_input_len,
+            output_len=args.sonnet_output_len,
+            prefix_len=args.sonnet_prefix_len,
             tokenizer=tokenizer,
         )
         input_requests = [(prompt_formatted, prompt_len, output_len)
                           for prompt, prompt_formatted, prompt_len,
                           output_len in input_requests]

+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+        )
+
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")

@@ -415,7 +517,7 @@ def main(args: argparse.Namespace):

     # Save config and results to json
     if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}

         # Setup
         current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -448,6 +550,8 @@ def main(args: argparse.Namespace):
         # Save to file
         base_model_id = model_id.split("/")[-1]
         file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_filename:
+            file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
         with open(file_name, "w") as outfile:
@@ -455,7 +559,7 @@ def main(args: argparse.Namespace):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
         description="Benchmark the online serving throughput.")
     parser.add_argument(
         "--backend",
@@ -488,7 +592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "sonnet"],
+        choices=["sharegpt", "sonnet", "random"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument("--dataset-path",
@@ -505,7 +609,7 @@ if __name__ == "__main__":
         "--tokenizer",
         type=str,
         help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument(
         "--best-of",
@@ -521,6 +625,12 @@ if __name__ == "__main__":
         default=1000,
         help="Number of prompts to process.",
     )
+    parser.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.")
     parser.add_argument(
         "--sonnet-input-len",
         type=int,
@@ -542,6 +652,27 @@ if __name__ == "__main__":
         help=
         "Number of prefix tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
@@ -582,6 +713,15 @@ if __name__ == "__main__":
         help="Specify directory to save benchmark json results."
         "If not specified, results are saved in the current directory.",
     )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )

     args = parser.parse_args()
     main(args)
@@ -10,6 +10,10 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)

+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser
+

 def sample_requests(
     dataset_path: str,
@@ -29,22 +33,23 @@ def sample_requests(
     dataset = [(data["conversations"][0]["value"],
                 data["conversations"][1]["value"]) for data in dataset]

-    # Tokenize the prompts and completions.
-    prompts = [prompt for prompt, _ in dataset]
-    prompt_token_ids = tokenizer(prompts).input_ids
-    completions = [completion for _, completion in dataset]
-    completion_token_ids = tokenizer(completions).input_ids
-    tokenized_dataset = []
-    for i in range(len(dataset)):
-        output_len = len(completion_token_ids[i])
-        if fixed_output_len is not None:
-            output_len = fixed_output_len
-        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+    # Shuffle the dataset.
+    random.shuffle(dataset)

-    # Filter out too long sequences.
+    # Filter out sequences that are too long or too short
     filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
@@ -53,9 +58,7 @@ def sample_requests(
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))

-    # Sample the requests.
-    sampled_requests = random.sample(filtered_dataset, num_requests)
-    return sampled_requests
+    return filtered_dataset


 def run_vllm(
@@ -72,47 +75,56 @@ def run_vllm(
     max_model_len: Optional[int],
     enforce_eager: bool,
     kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
     device: str,
     enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
+    load_format: str = EngineArgs.load_format,
 ) -> float:
     from vllm import LLM, SamplingParams
-    llm = LLM(model=model,
-              tokenizer=tokenizer,
-              quantization=quantization,
-              tensor_parallel_size=tensor_parallel_size,
-              seed=seed,
-              trust_remote_code=trust_remote_code,
-              dtype=dtype,
-              max_model_len=max_model_len,
-              gpu_memory_utilization=gpu_memory_utilization,
-              enforce_eager=enforce_eager,
-              kv_cache_dtype=kv_cache_dtype,
-              device=device,
-              enable_prefix_caching=enable_prefix_caching,
-              download_dir=download_dir)
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
+        load_format=load_format,
+    )

     # Add the requests to the engine.
+    prompts: List[str] = []
+    sampling_params: List[SamplingParams] = []
     for prompt, _, output_len in requests:
-        sampling_params = SamplingParams(
-            n=n,
-            temperature=0.0 if use_beam_search else 1.0,
-            top_p=1.0,
-            use_beam_search=use_beam_search,
-            ignore_eos=True,
-            max_tokens=output_len,
-        )
-        # FIXME(woosuk): Do not use internal method.
-        llm._add_request(
-            prompt=prompt,
-            prompt_token_ids=None,
-            sampling_params=sampling_params,
-        )
+        prompts.append(prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))

     start = time.perf_counter()
-    # FIXME(woosuk): Do not use internal method.
-    llm._run_engine(use_tqdm=True)
+    llm.generate(prompts, sampling_params, use_tqdm=True)
     end = time.perf_counter()
     return end - start
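The run_vllm rewrite drops the internal `llm._add_request`/`llm._run_engine` pair for the public batched API: parallel lists of prompts and per-request `SamplingParams` handed to a single `generate()` call, which lets the engine batch and schedule internally. A minimal sketch (model and requests are illustrative):

```python
import time

from vllm import LLM, SamplingParams

requests = [("Hello", 128), ("The capital of France is", 64)]

llm = LLM(model="facebook/opt-125m")  # placeholder model
prompts = [prompt for prompt, _ in requests]
sampling_params = [
    SamplingParams(ignore_eos=True, max_tokens=output_len)
    for _, output_len in requests
]

start = time.perf_counter()
# One generate() call; prompts and params are matched by position.
llm.generate(prompts, sampling_params, use_tqdm=True)
print(f"elapsed: {time.perf_counter() - start:.2f}s")
```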
@@ -212,14 +224,15 @@ def main(args: argparse.Namespace):
                                    args.output_len)
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
-                                args.quantization, args.tensor_parallel_size,
-                                args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype,
-                                args.max_model_len, args.enforce_eager,
-                                args.kv_cache_dtype, args.device,
-                                args.enable_prefix_caching,
-                                args.gpu_memory_utilization, args.download_dir)
+        elapsed_time = run_vllm(
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.download_dir, args.load_format)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -235,9 +248,21 @@ def main(args: argparse.Namespace):
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} tokens/s")
 
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
     parser.add_argument("--backend",
                         type=str,
                         choices=["vllm", "hf", "mii"],
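Since the results file is written with `json.dump(..., indent=4)`, it can be consumed with the standard library alone; a minimal sketch, assuming the benchmark was run with `--output-json results.json`:

    import json

    with open("results.json") as f:
        results = json.load(f)
    # keys written by the benchmark: elapsed_time, num_requests,
    # total_num_tokens, requests_per_second, tokens_per_second
    print(f"{results['requests_per_second']:.2f} req/s, "
          f"{results['tokens_per_second']:.2f} tok/s")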
@@ -259,7 +284,7 @@ if __name__ == "__main__":
     parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                         default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
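`QUANTIZATION_METHODS` replaces the hard-coded choice list with vLLM's own registry of quantization backends, so the benchmark's choices stay in sync with what the engine supports. The corresponding import is outside this hunk; in vLLM of this vintage it would plausibly be (an assumption, not shown in the diff):

    # assumed import location; not visible in this hunk
    from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS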
@@ -304,27 +329,82 @@ if __name__ == "__main__":
                         action="store_true",
                         help="enforce eager execution")
     parser.add_argument(
-        "--kv-cache-dtype",
+        '--kv-cache-dtype',
         type=str,
-        choices=["auto", "fp8_e5m2"],
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
         default="auto",
-        help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
     parser.add_argument(
         "--device",
         type=str,
-        default="cuda",
-        choices=["cuda"],
-        help='device type for vLLM execution, supporting CUDA only currently.')
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
     parser.add_argument(
         "--enable-prefix-caching",
         action='store_true',
         help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
     parser.add_argument('--download-dir',
                         type=str,
                         default=None,
                         help='directory to download and load the weights, '
                         'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
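Taken together, an invocation of the updated throughput benchmark exercising the new flags might look like the sketch below. All flags shown appear in the argparse diff above except `--model`, which is implied by `args.model`; the model name and values are illustrative:

    python3 benchmarks/benchmark_throughput.py \
        --backend vllm \
        --model meta-llama/Llama-2-7b-hf \
        --kv-cache-dtype fp8 \
        --enable-chunked-prefill \
        --max-num-batched-tokens 4096 \
        --distributed-executor-backend mp \
        --load-format safetensors \
        --output-json results.json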
360  benchmarks/cutlass_benchmarks/w8a8_benchmarks.py  Normal file
@@ -0,0 +1,360 @@
import argparse
import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple

import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]

# helpers


def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(
        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)


def to_int8(tensor: torch.Tensor) -> torch.Tensor:
    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)


def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:

    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

    if dtype == torch.int8:
        return to_int8(a), to_int8(b)
    if dtype == torch.float8_e4m3fn:
        return to_fp8(a), to_fp8(b)

    raise ValueError("unsupported dtype")


# impl


def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
                    scale_b: torch.Tensor,
                    out_dtype: torch.dtype) -> torch.Tensor:
    return torch.mm(a, b)


def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
                     scale_b: torch.Tensor,
                     out_dtype: torch.dtype) -> torch.Tensor:
    return torch._scaled_mm(a,
                            b,
                            scale_a=scale_a,
                            scale_b=scale_b,
                            out_dtype=out_dtype)


def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
                                scale_a: torch.Tensor, scale_b: torch.Tensor,
                                out_dtype: torch.dtype) -> torch.Tensor:
    return torch._scaled_mm(a,
                            b,
                            scale_a=scale_a,
                            scale_b=scale_b,
                            out_dtype=out_dtype,
                            use_fast_accum=True)


def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
                 scale_b: torch.Tensor,
                 out_dtype: torch.dtype) -> torch.Tensor:
    return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)


# bench
def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
             scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
             sub_label: str, fn: Callable, description: str) -> TMeasurement:

    min_run_time = 1

    globals = {
        "a": a,
        "b": b,
        "scale_a": scale_a,
        "scale_b": scale_b,
        "out_dtype": out_dtype,
        "fn": fn,
    }
    return TBenchmark.Timer(
        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
        globals=globals,
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)


def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    timers = []
    # pytorch impl - bfloat16
    timers.append(
        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))

    # pytorch impl - float16
    timers.append(
        bench_fn(a.to(dtype=torch.float16, device="cuda"),
                 b.to(dtype=torch.float16, device="cuda"), scale_a, scale_b,
                 torch.float16, label, sub_label, pytorch_mm_impl,
                 "pytorch_fp16_fp16_fp16_matmul-no-scales"))

    # cutlass impl
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
                 cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))

    return timers


def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
              sub_label: str) -> Iterable[TMeasurement]:
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    timers = []

    # pytorch impl w. bf16
    timers.append(
        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
                 pytorch_fp8_impl_fast_accum,
                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
                 pytorch_fp8_impl_fast_accum,
                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
                 cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
    # cutlass impl: fp16 output
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
                 cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
    return timers


def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
          sub_label: str) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float8_e4m3fn:
        return bench_fp8(dtype, m, k, n, label, sub_label)
    raise ValueError("unsupported type")


# runner
def print_timers(timers: Iterable[TMeasurement]):
    compare = TBenchmark.Compare(timers)
    compare.print()


def run(dtype: torch.dtype,
        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:

    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
                       f"MKN=({m}x{k}x{n})")
        print_timers(timers)
        results.extend(timers)

    return results


# output makers
def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[Tuple[int, int, int]],
                base_description: str,
                timestamp=None):

    print(f"== All Results {base_description} ====")
    print_timers(data)

    # pickle all the results
    timestamp = int(time.time()) if timestamp is None else timestamp
    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
        pkl.dump(data, f)


# argparse runners


def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"square_bench-{args.dtype}")


def run_range_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
    n = len(dim_sizes)
    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
    MKNs = list(zip(Ms, Ks, Ns))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"range_bench-{args.dtype}")


def run_model_bench(args):

    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
            KNs.append(KN)
        return KNs

    model_bench_data = []
    models_tps = list(itertools.product(args.models, args.tp_sizes))
    for model, tp_size in models_tps:
        Ms = args.batch_sizes
        KNs = model_shapes(model, tp_size)
        MKNs = []
        for m in Ms:
            for k, n in KNs:
                MKNs.append((m, k, n))

        data = run(args.dtype, MKNs)
        model_bench_data.append(data)

    # Print all results
    for data, model_tp in zip(model_bench_data, models_tps):
        model, tp_size = model_tp
        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
        print_timers(data)

    timestamp = int(time.time())

    all_data = []
    for d in model_bench_data:
        all_data.extend(d)
    # pickle all data
    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
        pkl.dump(all_data, f)


if __name__ == '__main__':

    def to_torch_dtype(dt):
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError("unsupported dtype")

    parser = FlexibleArgumentParser(
        description="""
Benchmark Cutlass GEMM.

    To run square GEMMs:
        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64

    To run constant N and K and sweep M:
        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384

    To run dimensions from a model:
        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1

    Output:
        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("--dtype",
                        type=to_torch_dtype,
                        required=True,
                        help="Available options are ['int8', 'fp8']")
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
    square_parser.add_argument("--dim-start", type=int, required=True)
    square_parser.add_argument("--dim-end", type=int, required=True)
    square_parser.add_argument("--dim-increment", type=int, required=True)
    square_parser.set_defaults(func=run_square_bench)

    range_parser = subparsers.add_parser("range_bench")
    range_parser.add_argument("--dim-start", type=int, required=True)
    range_parser.add_argument("--dim-end", type=int, required=True)
    range_parser.add_argument("--dim-increment", type=int, required=True)
    range_parser.add_argument("--m-constant", type=int, default=None)
    range_parser.add_argument("--n-constant", type=int, default=None)
    range_parser.add_argument("--k-constant", type=int, default=None)
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument("--models",
                              nargs="+",
                              type=str,
                              default=DEFAULT_MODELS,
                              choices=WEIGHT_SHAPES.keys())
    model_parser.add_argument("--tp-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_BATCH_SIZES)
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    args.func(args)
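The .pkl files produced by make_output and run_model_bench are plain pickled lists of torch benchmark Measurement objects, so they can be reloaded and re-compared offline; a minimal sketch (the filename is illustrative):

    import pickle as pkl
    import torch.utils.benchmark as TBenchmark

    with open("model_bench-torch.float8_e4m3fn-1700000000.pkl", "rb") as f:
        measurements = pkl.load(f)  # list of TMeasurement
    TBenchmark.Compare(measurements).print()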
43  benchmarks/cutlass_benchmarks/weight_shapes.py  Normal file
@@ -0,0 +1,43 @@
# Weight Shapes are in the format
#  ([K, N], TP_SPLIT_DIM)
# Example:
#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
#   - TP1 : K = 14336, N = 4096
#   - TP2 : K = 7168, N = 4096
#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
#   - TP1 : K = 4096, N = 6144
#   - TP4 : K = 4096, N = 1536

# TP1 shapes
WEIGHT_SHAPES = {
    "mistralai/Mistral-7B-v0.1": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-7b-hf": [
        ([4096, 12288], 1),
        ([4096, 4096], 0),
        ([4096, 22016], 1),
        ([11008, 4096], 0),
    ],
    "meta-llama/Llama-3-8b": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-13b-hf": [
        ([5120, 15360], 1),
        ([5120, 5120], 0),
        ([5120, 27648], 1),
        ([13824, 5120], 0),
    ],
    "meta-llama/Llama-2-70b-hf": [
        ([8192, 10240], 1),
        ([8192, 8192], 0),
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
}
Some files were not shown because too many files have changed in this diff.