Compare commits: v0.11.1rc5...v0.12.0 (990 commits)
```diff
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
-model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.595
-  - name: "exact_match,flexible-extract"
-    value: 0.582
-limit: 1000
-num_fewshot: 5
```
```diff
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators, with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -34,6 +34,7 @@ Runtime environment variables:
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
 
 ### Latency test
```
````diff
@@ -107,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
+#### Default Parameters Field
+
+We can specify default parameters in a JSON field with the key `defaults`. Parameters defined in this field are applied globally to all serving tests and can be overridden in individual test case fields. Here is an example:
+
+<details>
+<summary> An example of the default parameters field </summary>
+
+```json
+{
+    "defaults": {
+        "qps_list": [
+            "inf"
+        ],
+        "server_environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+        },
+        "server_parameters": {
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "block_size": 128,
+            "disable_log_stats": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "num_prompts": 200,
+            "ignore-eos": ""
+        }
+    },
+    "tests": [
+        {
+            "test_name": "serving_llama3B_tp2_random_128_128",
+            "server_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct",
+                "tensor_parallel_size": 2
+            },
+            "client_parameters": {
+                "model": "meta-llama/Llama-3.2-3B-Instruct"
+            }
+        },
+        {
+            "test_name": "serving_qwen3_tp4_random_128_128",
+            "server_parameters": {
+                "model": "Qwen/Qwen3-14B",
+                "tensor_parallel_size": 4
+            },
+            "client_parameters": {
+                "model": "Qwen/Qwen3-14B"
+            }
+        }
+    ]
+}
+```
+
+</details>
+
 ### Visualizing the results
 
 The `convert-results-json-to-markdown.py` script helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
````
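Before looking at the implementation, it may help to see the override rule in isolation. The sketch below is a minimal, hypothetical example (not one of the real test files) showing jq's object-addition operator, which is what gives per-test values precedence over `defaults`:

```bash
# jq's `+` on objects keeps the right-hand value on key conflicts,
# so a test's own parameters override the "defaults" block.
echo '{
  "defaults": {"server_parameters": {"tensor_parallel_size": 1, "dtype": "bfloat16"}},
  "tests": [{"test_name": "demo", "server_parameters": {"tensor_parallel_size": 2}}]
}' | jq '.defaults.server_parameters + .tests[0].server_parameters'
# => {"tensor_parallel_size": 2, "dtype": "bfloat16"}
```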
```diff
@@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
@@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
@@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
```
```diff
@@ -15,6 +15,8 @@ check_gpus() {
     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   elif command -v amd-smi; then
     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
   fi
 
   if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
+
+  declare -g arch_suffix=''
+
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
   fi
   echo "GPU type is $gpu_type"
 }
@@ -102,7 +110,8 @@ json2envs() {
 wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
-  timeout 1200 bash -c '
+  local timeout_val="1200"
+  timeout "$timeout_val" bash -c '
     until curl -X POST localhost:8000/v1/completions; do
       sleep 1
     done' && return 0 || return 1
@@ -138,6 +147,10 @@ kill_gpu_processes() {
     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
       sleep 1
     done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
   fi
 
   # remove vllm config file
@@ -304,12 +317,44 @@ run_throughput_tests() {
 run_serving_tests() {
   # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases
+  #
+  # Supported JSON formats:
+  # 1) Plain format: top-level array
+  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #
+  # 2) Default parameters field + plain format tests
+  #    {
+  #      "defaults": { ... },
+  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
+  #    }
 
   local serving_test_file
   serving_test_file=$1
 
   # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
+  jq -c '
+    if type == "array" then
+      # Plain format: test cases array
+      .[]
+    elif (type == "object" and has("tests")) then
+      # merge the default parameters into each test case
+      . as $root
+      | ($root.defaults // {}) as $d
+      | ($root.tests // [])[]
+      # default qps / max_concurrency from defaults if missing
+      | .qps_list = (.qps_list // $d.qps_list)
+      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
+      # merge envs / params: test overrides defaults
+      | .server_environment_variables =
+          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
+      | .server_parameters =
+          (($d.server_parameters // {}) + (.server_parameters // {}))
+      | .client_parameters =
+          (($d.client_parameters // {}) + (.client_parameters // {}))
+    else
+      error("Unsupported serving test file format: must be array or object with .tests")
+    end
+  ' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
     if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -323,20 +368,25 @@ run_serving_tests() {
       continue
     fi
 
-    # get client and server arguments
+    # get client and server arguments (after merging the default parameters)
    server_params=$(echo "$params" | jq -r '.server_parameters')
     server_envs=$(echo "$params" | jq -r '.server_environment_variables')
     client_params=$(echo "$params" | jq -r '.client_parameters')
 
     server_args=$(json2args "$server_params")
     server_envs=$(json2envs "$server_envs")
     client_args=$(json2args "$client_params")
 
     # qps_list
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
     # max_concurrency_list (fallback to num_prompts if missing)
     max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
     if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-    num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-    max_concurrency_list="[$num_prompts]"
+      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+      max_concurrency_list="[$num_prompts]"
     fi
     max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
     echo "Running over max concurrency list $max_concurrency_list"
@@ -451,6 +501,7 @@ main() {
     ARCH='-cpu'
   else
     check_gpus
+    ARCH="$arch_suffix"
   fi
   check_hf_token
```
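The merge filter above can be exercised on its own. A minimal sketch, assuming only that jq is available; the input file here is fabricated for illustration and is not one of the suite's real test configs:

```bash
# Feed a tiny defaults-format file through the same merge logic as
# run_serving_tests and print the merged per-test JSON objects.
cat > /tmp/demo-serving-tests.json <<'EOF'
{
  "defaults": {"qps_list": ["inf"], "server_parameters": {"dtype": "bfloat16"}},
  "tests": [{"test_name": "serving_demo_tp2", "server_parameters": {"tensor_parallel_size": 2}}]
}
EOF
jq -c '
  if type == "array" then .[]
  elif (type == "object" and has("tests")) then
    . as $root
    | ($root.defaults // {}) as $d
    | ($root.tests // [])[]
    | .qps_list = (.qps_list // $d.qps_list)
    | .server_parameters = (($d.server_parameters // {}) + (.server_parameters // {}))
  else error("Unsupported serving test file format") end
' /tmp/demo-serving-tests.json
# => {"test_name":"serving_demo_tp2","qps_list":["inf"],
#     "server_parameters":{"dtype":"bfloat16","tensor_parallel_size":2}}
```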
```diff
@@ -0,0 +1,55 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15,
+            "max-model-len": 256,
+            "async-scheduling": ""
+        }
+    }
+]
```
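Each entry above is converted to CLI flags by the suite's `json2args`/`json2envs` helpers before the benchmark runs. As a rough sketch of that flow — the flag spelling and the `vllm bench latency` entry point are assumptions here, since the helper bodies are not part of this diff:

```bash
# Hypothetical rendering of the first HPU latency test into a command line.
# Assumes json2args-style conversion: {"key": value} -> "--key value",
# with underscores in keys replaced by dashes.
params='{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1,
         "load_format": "dummy", "num-iters-warmup": 5, "num-iters": 15,
         "max-model-len": 256}'
args=$(echo "$params" | jq -r 'to_entries
  | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring))
  | join(" ")')
echo PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=1 VLLM_DEFRAG=1 \
  vllm bench latency $args
```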
```diff
@@ -1,610 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int8_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_sharegpt",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp1_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 1,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp2_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 2,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    },
-    {
-        "test_name": "serving_llama8B_int4_tp4_random_128_128",
-        "qps_list": ["inf"],
-        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
-        "server_environment_variables": {
-            "VLLM_RPC_TIMEOUT": 100000,
-            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-            "VLLM_CPU_SGL_KERNEL": 1,
-            "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "server_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "quantization": "awq",
-            "tensor_parallel_size": 4,
-            "dtype": "bfloat16",
-            "distributed_executor_backend": "mp",
-            "block_size": 128,
-            "trust_remote_code": "",
-            "enable_chunked_prefill": "",
-            "disable_log_stats": "",
-            "enforce_eager": "",
-            "max_num_batched_tokens": 2048,
-            "max_num_seqs": 256,
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-            "backend": "vllm",
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128,
-            "ignore-eos": "",
-            "num_prompts": 1000
-        }
-    }
-]
```
File diff suppressed because it is too large.
@@ -1,276 +1,246 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 32
|
||||
}
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"max_concurrency_list": [32],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 32
|
||||
}
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
    {
        "test_name": "serving_llama8B_tp1_random_128_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_128_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp1_random_128_2048",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 2048,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_128_2048",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 2048,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp1_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 2048,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 2048,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 200
|
||||
}
|
||||
]
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_granite2B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "ibm-granite/granite-3.2-2b-instruct",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen1.7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-1.7B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen4B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-4B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen8B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_glm9B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "zai-org/glm-4-9b-hf",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_gemma7B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "google/gemma-7b",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
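The file above replaces per-test duplication with a defaults-plus-overrides layout. Below is a minimal sketch of how such a config could be expanded back into full test definitions; the deep-merge semantics and the file name are assumptions for illustration, not the benchmark suite's actual code.

# expand_config.py -- illustrative only; merge semantics are assumed
import json


def deep_merge(base: dict, override: dict) -> dict:
    """Recursively overlay `override` on top of `base` without mutating either."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged


with open("serving-tests-cpu.json") as f:  # hypothetical file name
    config = json.load(f)

tests = [deep_merge(config.get("defaults", {}), t) for t in config["tests"]]
for t in tests:
    # e.g. "serving_llama8B_tp2_sharegpt 2"
    print(t["test_name"], t["server_parameters"]["tensor_parallel_size"])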
@@ -0,0 +1,82 @@
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
            "max-num-seqs": 256,
            "async-scheduling": ""
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama70B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
            "max-num-seqs": 256,
            "async-scheduling": ""
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
            "max-num-seqs": 256,
            "async-scheduling": ""
        },
        "client_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    }
]
@@ -0,0 +1,61 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 1000,
            "backend": "vllm",
            "max-model-len": 2048,
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "throughput_llama70B_tp4",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 1000,
            "backend": "vllm",
            "max-model-len": 2048,
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "throughput_mixtral8x7B_tp2",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 1000,
            "backend": "vllm",
            "max-model-len": 2048,
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
    }
]
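These configs appear to follow a simple flag convention: keys map to CLI flags with underscores replaced by dashes, and an empty-string value denotes a bare boolean flag (e.g. "async-scheduling": "" or "trust_remote_code": ""). A sketch of that mapping, inferred from the files above rather than taken from the harness's actual code:

# to_cli_args.py -- assumed flag convention, for illustration only
def to_cli_args(params: dict) -> list[str]:
    args = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if value == "":
            args.append(flag)  # bare flag, no value
        else:
            args.extend([flag, str(value)])
    return args


params = {"tensor_parallel_size": 1, "max-num-seqs": 512, "async-scheduling": ""}
print(to_cli_args(params))
# ['--tensor-parallel-size', '1', '--max-num-seqs', '512', '--async-scheduling']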
@@ -8,7 +8,7 @@ steps:
  commands:
    # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh"
@@ -30,19 +30,6 @@ steps:
    DOCKER_BUILDKIT: "1"

# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
  depends_on: ~
  id: build-wheel-cuda-12-8
  agents:
    queue: cpu_queue_postmerge
  commands:
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh"
  env:
    DOCKER_BUILDKIT: "1"

- label: "Build wheel - CUDA 12.9"
  depends_on: ~
  id: build-wheel-cuda-12-9
@@ -109,31 +96,12 @@ steps:
- label: "Annotate release workflow"
  depends_on:
    - create-multi-arch-manifest
    - build-wheel-cuda-12-8
  id: annotate-release-workflow
  agents:
    queue: cpu_queue_postmerge
  commands:
    - "bash .buildkite/scripts/annotate-release.sh"

- label: "Build and publish TPU release image"
  depends_on: ~
  if: build.env("NIGHTLY") == "1"
  agents:
    queue: tpu_queue_postmerge
  commands:
    - "yes | docker system prune -a"
    - "git fetch --all"
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
    - "docker push vllm/vllm-tpu:nightly"
    - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
  plugins:
    - docker-login#v3.0.0:
        username: vllmbot
        password-env: DOCKERHUB_TOKEN
  env:
    DOCKER_BUILDKIT: "1"

- input: "Provide Release version here"
  id: input-release-version
  fields:
@@ -150,7 +118,7 @@ steps:
    queue: cpu_queue_postmerge
  commands:
    - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
    - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
    - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
  env:
@@ -2,22 +2,29 @@

set -ex

# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')

if [ -z "$RELEASE_VERSION" ]; then
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
  exit 1
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
  RELEASE_VERSION="1.0.0.dev"
fi

buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
To download the wheel (by commit):
\`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`

To download the wheel (by version):
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`

To download and upload the image:
@@ -38,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
.buildkite/scripts/generate-nightly-index.py (369 lines, new file)
@@ -0,0 +1,369 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# do not complain about line length (for docstring)
# ruff: noqa: E501

import argparse
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote

if not sys.version_info >= (3, 12):
    raise RuntimeError("This script requires Python 3.12 or higher.")

INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""


@dataclass
class WheelFileInfo:
    package_name: str
    version: str
    build_tag: str | None
    python_tag: str
    abi_tag: str
    platform_tag: str
    variant: str | None
    filename: str


def parse_from_filename(file: str) -> WheelFileInfo:
    """
    Parse wheel file name to extract metadata.

    The format of wheel names:
    {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
    Example:
        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
    """
    wheel_file_re = re.compile(
        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
    )
    match = wheel_file_re.match(file)
    if not match:
        raise ValueError(f"Invalid wheel file name: {file}")

    package_name = match.group("package_name")
    version = match.group("version")
    build_tag = match.group("build_tag")
    python_tag = match.group("python_tag")
    abi_tag = match.group("abi_tag")
    platform_tag = match.group("platform_tag")

    # extract variant from version
    variant = None
    if "dev" in version:
        ver_after_dev = version.split("dev")[-1]
        if "." in ver_after_dev:
            variant = ver_after_dev.split(".")[-1]
            version = version.removesuffix("." + variant)
    else:
        if "+" in version:
            version, variant = version.split("+")

    return WheelFileInfo(
        package_name=package_name,
        version=version,
        build_tag=build_tag,
        python_tag=python_tag,
        abi_tag=abi_tag,
        platform_tag=platform_tag,
        variant=variant,
        filename=file,
    )


def generate_project_list(subdir_names: list[str]) -> str:
    """
    Generate project list HTML content linking to each project & variant sub-directory.
    """
    href_tags = []
    for name in sorted(subdir_names):
        name = name.strip("/").strip(".")
        href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))


def generate_package_index_and_metadata(
    wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
) -> tuple[str, str]:
    """
    Generate package index HTML content for a specific package, linking to actual wheel files.
    """
    href_tags = []
    metadata = []
    for file in sorted(wheel_files, key=lambda x: x.filename):
        relative_path = (
            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
        )
        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
        # NOTE: this is AWS S3 specific behavior!
        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
        href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
        file_meta = asdict(file)
        file_meta["path"] = file_path_quoted
        metadata.append(file_meta)
    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
    metadata_str = json.dumps(metadata, indent=2)
    return index_str, metadata_str


def generate_index_and_metadata(
    whl_files: list[str],
    wheel_base_dir: Path,
    index_base_dir: Path,
    default_variant: str | None = None,
    alias_to_default: str | None = None,
):
    """
    Generate index for all wheel files.

    Args:
        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
        wheel_base_dir (Path): Base directory for wheel files.
        index_base_dir (Path): Base directory to store index files.
        default_variant (str | None): The default variant name, if any.
        alias_to_default (str | None): Alias variant name for the default variant, if any.

    First, parse all wheel files to extract metadata.
    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
            vllm/
                index.html  # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json  # machine-readable metadata for all wheels in this package
            cpu/  # cpu variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            cu129/  # cu129 is actually the alias to default variant
                index.html
                vllm/
                    index.html
                    metadata.json
            cu130/  # cu130 variant sub-directory
                index.html
                vllm/
                    index.html
                    metadata.json
            ...

    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
        [
            {
                "package_name": "vllm",
                "version": "0.10.2rc2",
                "build_tag": null,
                "python_tag": "cp38",
                "abi_tag": "abi3",
                "platform_tag": "manylinux2014_aarch64",
                "variant": "cu129",
                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl"  # to be concatenated with the directory URL and URL-encoded
            },
            ...
        ]
    """

    parsed_files = [parse_from_filename(f) for f in whl_files]

    if not parsed_files:
        print("No wheel files found, skipping index generation.")
        return

    # Group by variant
    variant_to_files: dict[str, list[WheelFileInfo]] = {}
    for file in parsed_files:
        variant = file.variant or "default"
        if variant not in variant_to_files:
            variant_to_files[variant] = []
        variant_to_files[variant].append(file)

    print(f"Found variants: {list(variant_to_files.keys())}")

    # sanity check for default variant
    if default_variant:
        if "default" in variant_to_files:
            raise ValueError(
                "All wheel files must have variant suffixes when `default_variant` is specified."
            )
        if default_variant not in variant_to_files:
            raise ValueError(
                f"Default variant '{default_variant}' not found among wheel files."
            )

    if alias_to_default:
        if "default" not in variant_to_files:
            # e.g. only some wheels are uploaded to S3 currently
            print(
                "[WARN] Alias to default variant specified, but no default variant found."
            )
        elif alias_to_default in variant_to_files:
            raise ValueError(
                f"Alias variant name '{alias_to_default}' already exists among wheel files."
            )
        else:
            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
            print(f"Alias variant '{alias_to_default}' created for default variant.")

    # Generate index for each variant
    subdir_names = set()
    for variant, files in variant_to_files.items():
        if variant == "default":
            variant_dir = index_base_dir
        else:
            variant_dir = index_base_dir / variant
            subdir_names.add(variant)

        variant_dir.mkdir(parents=True, exist_ok=True)

        # gather all package names in this variant
        packages = set(f.package_name for f in files)
        if variant == "default":
            # these packages should also appear in the "project list"
            # generate after all variants are processed
            subdir_names = subdir_names.union(packages)
        else:
            # generate project list for this variant directly
            project_list_str = generate_project_list(sorted(packages))
            with open(variant_dir / "index.html", "w") as f:
                f.write(project_list_str)

        for package in packages:
            # filter files belonging to this package only
            package_files = [f for f in files if f.package_name == package]
            package_dir = variant_dir / package
            package_dir.mkdir(parents=True, exist_ok=True)
            index_str, metadata_str = generate_package_index_and_metadata(
                package_files, wheel_base_dir, package_dir
            )
            with open(package_dir / "index.html", "w") as f:
                f.write(index_str)
            with open(package_dir / "metadata.json", "w") as f:
                f.write(metadata_str)

    # Generate top-level project list index
    project_list_str = generate_project_list(sorted(subdir_names))
    with open(index_base_dir / "index.html", "w") as f:
        f.write(project_list_str)


if __name__ == "__main__":
    """
    Arguments:
        --version <version> : version string for the current build (e.g., commit hash)
        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
        --output-dir <output_directory> : directory to store generated index files
        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
    """

    parser = argparse.ArgumentParser(
        description="Process nightly build wheel files to generate indices."
    )
    parser.add_argument(
        "--version",
        type=str,
        required=True,
        help="Version string for the current build (e.g., commit hash)",
    )
    parser.add_argument(
        "--current-objects",
        type=str,
        required=True,
        help="Path to JSON file containing current S3 objects listing in this version directory",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory to store generated index files",
    )
    parser.add_argument(
        "--alias-to-default",
        type=str,
        default=None,
        help="Alias variant name for the default variant",
    )

    args = parser.parse_args()

    version = args.version
    if "/" in version or "\\" in version:
        raise ValueError("Version string must not contain slashes.")
    current_objects_path = Path(args.current_objects)
    output_dir = Path(args.output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)

    # Read current objects JSON
    with open(current_objects_path) as f:
        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)

    # current_objects looks like from list_objects_v2 S3 API:
    """
    "Contents": [
        {
            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
            "LastModified": "2025-11-28T14:00:32+00:00",
            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
            "ChecksumAlgorithm": [
                "CRC64NVME"
            ],
            "ChecksumType": "FULL_OBJECT",
            "Size": 435649349,
            "StorageClass": "STANDARD"
        },
        ...
    ]
    """

    # Extract wheel file keys
    wheel_files = []
    for item in current_objects.get("Contents", []):
        key: str = item["Key"]
        if key.endswith(".whl"):
            wheel_files.append(key.split("/")[-1])  # only the filename is used

    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")

    # Generate index and metadata, assuming wheels and indices are stored as:
    # s3://vllm-wheels/{version}/<wheel files>
    # s3://vllm-wheels/<anything>/<index files>
    wheel_base_dir = Path(output_dir).parent / version
    index_base_dir = Path(output_dir)

    generate_index_and_metadata(
        whl_files=wheel_files,
        wheel_base_dir=wheel_base_dir,
        index_base_dir=index_base_dir,
        default_variant=None,
        alias_to_default=args.alias_to_default,
    )
    print(f"Successfully generated index and metadata in {output_dir}")
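A quick sanity check of `parse_from_filename` against the wheel names listed in its own docstring (runnable from the same module, or from a test file that imports it):

# Exercises the parser above on its documented example inputs.
info = parse_from_filename("vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl")
assert info.package_name == "vllm"
assert info.version == "0.10.2rc2"  # '+cu129' is split off as the variant
assert info.variant == "cu129"
assert info.platform_tag == "manylinux2014_aarch64"

# Dev builds keep their local-version tag; only a trailing '.cuXXX' is a variant.
info = parse_from_filename(
    "vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl"
)
assert info.version == "0.11.1rc8.dev14+gaa384b3c0"
assert info.variant == "cu130"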
@@ -59,7 +59,7 @@ while true; do
    fi
done

echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"

if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi

if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}

if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
@@ -173,19 +169,28 @@ fi
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# check if the command contains the shard flag; we will run all shards in parallel because the host has 8 GPUs.
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# check if the command contains the shard flag; we will run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  # assign job count as the number of shards used
  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
@@ -217,8 +222,8 @@ else
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES=0 \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh (62 lines, new executable file)
@@ -0,0 +1,62 @@
#!/bin/bash

# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# allow binding to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}

export CMAKE_BUILD_PARALLEL_LEVEL=16

# Setup cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .

# Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test

function cpu_tests() {
  set -e

  docker exec cpu-test bash -c "
    set -e
    pip list"

  # offline inference
  docker exec cpu-test bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run kernel tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"

  # basic online serving
  docker exec cpu-test bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model Qwen/Qwen3-0.6B \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
}

# All CPU tests are expected to finish in less than 40 minutes.
export -f cpu_tests
timeout 2h bash -c cpu_tests
@@ -25,20 +25,22 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pip install sentence-transformers datamodel_code_generator tblib

    # Note: disable Bart until it supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: The test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for the time being.
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -49,6 +49,7 @@ function cpu_tests() {
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
    pytest -x -v -s tests/kernels/test_onednn.py"

  # Run basic model test
@@ -72,12 +73,11 @@ function cpu_tests() {
    pytest -x -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

  # Note: disabled until it supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
  #   VLLM_USE_V1=0 pytest -x -s -v \
  #     tests/quantization/test_ipex_quant.py"
  # Run AWQ/GPTQ test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
      tests/quantization/test_cpu_wna16.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
@@ -116,4 +116,4 @@ function cpu_tests() {

# All CPU tests are expected to finish in less than 40 minutes.
export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@@ -35,7 +35,7 @@ docker run \
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
@@ -46,6 +46,6 @@ docker run \
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
  '
@@ -0,0 +1,72 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MoE padding for ROCm since it causes EPLB to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="QWen/Qwen3-30B-A3B-FP8"
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MoE padding for ROCm since it causes EPLB to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
    --data-parallel-size ${DATA_PARALLEL_SIZE} \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -2,6 +2,28 @@
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== part 0: setup ========
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
INDICES_OUTPUT_DIR="indices"
|
||||
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
|
||||
PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
# detect if python3.10+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# use new python from docker
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ========= part 1: collect, rename & upload the wheel ==========
|
||||
|
||||
# Assume wheels are in artifacts/dist/*.whl
|
||||
wheel_files=(artifacts/dist/*.whl)
|
||||
|
||||
@@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
    echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
    exit 1
fi

# Get the single wheel file
wheel="${wheel_files[0]}"

# Detect architecture and rename 'linux' to appropriate manylinux version
arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
    manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
    manylinux_version="manylinux2014"
else
    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
    manylinux_version="manylinux1"
fi
# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
manylinux_version="manylinux_2_31"

# Rename 'linux' to the appropriate manylinux version in the wheel filename
if [[ "$wheel" != *"linux"* ]]; then
    echo "Error: Wheel filename does not contain 'linux': $wheel"
    exit 1
fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
echo "Renamed wheel to: $wheel"

# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"
echo "Version in wheel: $version"
pure_version="${version%%+*}"
echo "Pure version (without variant): $pure_version"

normal_wheel="$wheel" # Save the original wheel filename
# copy wheel to its own bucket
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"

# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
    suffix="${version##*.}"
    if [[ $suffix == cu* ]]; then
        new_version="1.0.0.dev+${suffix}"
    else
        new_version="1.0.0.dev"
    fi
    new_wheel="${wheel/$version/$new_version}"
    # use cp to keep both files in the artifacts directory
    cp -- "$wheel" "$new_wheel"
    wheel="$new_wheel"
    version="$new_version"
fi
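A worked example of the suffix handling above, using a hypothetical dev version in the dotted local-version layout this block assumes:

version="0.12.0.dev42+g4fd9d6a85c.cu129"  # hypothetical
suffix="${version##*.}"                   # strips through the last '.': cu129
echo "1.0.0.dev+${suffix}"                # 1.0.0.dev+cu129; without a cu* suffix, plain 1.0.0.dev
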
# ========= part 2: generate and upload indices ==========
# generate indices for all existing wheels in the commit directory
# this script might be run multiple times if there are multiple variants being built
# so we need to guarantee there is little chance for "TOCTOU" issues
# i.e., one process is generating indices while another is uploading a new wheel
# so we need to ensure no time-consuming operations happen below

# Upload the wheel to S3
python3 .buildkite/generate_index.py --wheel "$normal_wheel"
# list all wheels in the commit directory
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"

# generate index for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

if [[ $normal_wheel == *"cu129"* ]]; then
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
# call script to generate indices for all existing wheels
# these indices use relative paths, which work as long as each index sits next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
    echo "Skipping index files for non-cu129 wheels"
    alias_arg=""
fi

# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
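The relative-path property described in the comments above can be sketched locally; the wheel filename below is hypothetical:

mkdir -p indices_demo/vllm
# An index entry pointing one directory up, i.e. next to the vllm/ index directory.
echo '<a href="../vllm-1.0.0.dev-cp38-abi3-manylinux_2_31_x86_64.whl">vllm</a>' > indices_demo/vllm/index.html
# Syncing indices_demo/ to /<commit>/, /nightly/, or /<version>/ keeps the link valid,
# since it always resolves relative to wherever index.html lands.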

if [[ $normal_wheel == *"cu129"* ]]; then
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else
    echo "Skipping index files for non-cu129 wheels"
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"

# copy to /nightly/ only if it is on the main branch and not a PR
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
    echo "Uploading indices to overwrite /nightly/"
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
# copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
    echo "Uploading indices to overwrite /$pure_version/"
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

@@ -39,20 +39,20 @@ steps:
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  # grade: Blocking
  grade: Blocking
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh

- label: Async Engine, Inputs, Utils, Worker Test # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental, amdproduction]
- label: Async Engine, Inputs, Utils, Worker Test # 10min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  # grade: Blocking
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/multimodal
@@ -61,25 +61,29 @@ steps:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental, amdproduction]
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  # grade: Blocking
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/transformers_utils
  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s tokenizers_
  - pytest -v -s transformers_utils
  - pytest -v -s config

- label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@@ -111,9 +115,9 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py

- label: Entrypoints Unit Tests # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  # grade: Blocking
  grade: Blocking
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@@ -187,7 +191,7 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/test_basic_correctness
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
@@ -210,12 +214,13 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -226,10 +231,31 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: EPLB Algorithm Test # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
- label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  #- export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

- label: EPLB Algorithm Test # 5min
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
@@ -238,11 +264,11 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

- label: EPLB Execution Test # 5min
- label: EPLB Execution Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 15
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -250,6 +276,7 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py

- label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
@@ -273,7 +300,7 @@ steps:

- label: Regression Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
@@ -284,23 +311,20 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
  timeout_in_minutes: 40
- label: Engine Test # 9min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  #grade: Blocking
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
@@ -318,9 +342,9 @@ steps:

- label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental, amdproduction]
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  # grade: Blocking
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/v1
@@ -337,6 +361,7 @@ steps:
  - tests/v1
  commands:
  # split the test to avoid interference
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
@@ -348,14 +373,52 @@ steps:
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test others (CPU) # 5 mins
  mirror_hardwares: [amdexperimental, amdproduction]
# TODO: Add the "V1 Test attention (MI300)" test group

- label: V1 Test attention (H100) # 10min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - pytest -v -s v1/attention

- label: Batch Invariance Tests (H100) # 10min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
  - vllm/
  - tests/v1/determinism/
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pip install pytest-timeout pytest-forked
  - pytest -v -s v1/determinism/test_batch_invariance.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

- label: V1 Test attention (B200) # 10min
  timeout_in_minutes: 30
  gpu: b200
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this

- label: V1 Test others (CPU) # 5 mins
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
  agent_pool: mi325_1
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/v1
@@ -441,13 +504,13 @@ steps:
    --ignore=lora/test_llm_with_multi_loras.py \
    --ignore=lora/test_olmoe_tp.py \
    --ignore=lora/test_deepseekv2_tp.py \
    --ignore=lora/test_gptoss.py \
    --ignore=lora/test_gptoss_tp.py \
    --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
@@ -455,33 +518,15 @@ steps:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_pass_manager.py
  - pytest -v -s compile/test_fusion.py
  - pytest -v -s compile/test_fusion_attn.py
  - pytest -v -s compile/test_functionalization.py
  - pytest -v -s compile/test_silu_mul_quant_fusion.py
  # - pytest -v -s compile/test_sequence_parallelism.py
  # - pytest -v -s compile/test_async_tp.py
  - pytest -v -s compile/test_fusion_all_reduce.py
  - pytest -v -s compile/test_decorator.py
  - pytest -v -s compile/test_noop_elimination.py
  - pytest -v -s compile/test_aot_compile.py
  # Run unit tests defined directly under compile/,
  # not including subdirectories, which are usually heavier
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 22min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
@@ -490,8 +535,39 @@ steps:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py
  # Run smoke tests under fullgraph directory, except test_full_graph.py
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Test # 27min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

- label: Cudagraph test
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  commands:
  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py

- label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -507,7 +583,7 @@ steps:

- label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
@@ -534,7 +610,7 @@ steps:

- label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
@@ -543,6 +619,8 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  commands:
  - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@@ -559,12 +637,35 @@ steps:
  commands:
  - pytest -v -s kernels/mamba

- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
  # Not replicating for CUTLASS & CuTe
  timeout_in_minutes: 45
  gpu: h100
  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization/test_block_fp8.py
  - tests/kernels/moe/test_deepgemm.py
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
  - pytest -v -s kernels/moe/test_deepgemm.py
  - pytest -v -s kernels/moe/test_batched_deepgemm.py
  - pytest -v -s kernels/attention/test_deepgemm_attention.py

- label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -616,9 +717,9 @@ steps:
  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
- label: LM Eval Small Models # 15min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
@@ -627,16 +728,17 @@ steps:
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: OpenAI API correctness # 22min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
- label: OpenAI API correctness # 10min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands: # LMEval+Transcription WER check
  commands: # LMEval
  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
  - pytest -s entrypoints/openai/correctness/

- label: OpenAI-Compatible Tool Use # 23 min
@@ -686,6 +788,7 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/transformers_utils/
  - tests/models/test_initialization.py
  commands:
  # Only when vLLM model source is modified - test initialization of a large
@@ -858,10 +961,11 @@ steps:
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
  timeout_in_minutes: 70
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
@@ -932,16 +1036,17 @@ steps:
- label: Transformers Nightly Models Test
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/"
  optional: true
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
  - pytest -v -s tests/models/test_initialization.py
  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
  - pytest -v -s tests/models/test_transformers.py
  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  # - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
  - python3 examples/offline_inference/basic/chat.py
  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -959,11 +1064,16 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
  - vllm/platforms/cuda.py
  - vllm/attention/selector.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
  - pytest -v -s tests/kernels/attention/test_attention_selector.py
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -980,8 +1090,9 @@ steps:
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion Tests # 30 min
- label: Blackwell Fusion and Compile Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
@@ -989,23 +1100,58 @@ steps:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - vllm/model_executor/layers/fused_moe/layer.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Blackwell GPT-OSS Eval
- label: ROCm GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  agent_pool: mi325_1
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
@@ -1014,7 +1160,7 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
  - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

- label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
@@ -1104,7 +1250,7 @@ steps:
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/test_basic_correctness.py
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
@@ -1114,10 +1260,11 @@ steps:
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1149,7 +1296,7 @@ steps:

- label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@@ -1218,9 +1365,14 @@ steps:
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
  - pytest -v -s -x lora/test_olmoe_tp.py

  # Disabled for now because MXFP4 backend on non-cuda platform
  # doesn't support LoRA yet
  #- pytest -v -s -x lora/test_gptoss_tp.py


- label: Weight Loading Multiple GPU Test # 33min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@@ -1230,7 +1382,7 @@ steps:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental]
@@ -1238,17 +1390,17 @@ steps:
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

- label: NixlConnector PD accuracy tests (Distributed) # 30min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@@ -1263,6 +1415,9 @@ steps:
##### A100 test #####

- label: Distributed Tests (A100) # optional
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
@@ -1277,6 +1432,9 @@ steps:
  - pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: a100
  optional: true
  num_gpus: 4
@@ -1288,19 +1446,41 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

##### H100 test #####
- label: LM Eval Large Models (H100) # optional
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

##### H200 test #####
- label: Distributed Tests (H200) # optional
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/compile/test_async_tp.py
  - pytest -v -s tests/compile/test_sequence_parallelism.py
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - pytest -v -s tests/compile/distributed/test_async_tp.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  - pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
  - pytest -v -s tests/v1/distributed/test_dbo.py

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1311,6 +1491,7 @@ steps:
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
  - pytest -v -s tests/v1/distributed/test_dbo.py

##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
@@ -1326,3 +1507,36 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
  - bash .buildkite/scripts/run-prime-rl-test.sh

- label: DeepSeek V2-Lite Accuracy
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
  gpu: b200
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -25,6 +25,7 @@
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch.

# When adding a test
# - If the test belongs to an existing group, add it there
@@ -56,22 +57,26 @@ steps:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
  timeout_in_minutes: 10
- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
  timeout_in_minutes: 20
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/transformers_utils
  - tests/config
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s tokenizers_
  - pytest -v -s transformers_utils
  - pytest -v -s config

- label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@@ -164,7 +169,7 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/test_basic_correctness
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
@@ -189,12 +194,13 @@ steps:
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -232,8 +238,8 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

- label: EPLB Execution Test # 5min
  timeout_in_minutes: 15
- label: EPLB Execution Test # 10min
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -241,6 +247,7 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py

- label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
@@ -271,21 +278,18 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
  timeout_in_minutes: 40
- label: Engine Test # 9min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
@@ -315,6 +319,7 @@ steps:
  - vllm/
  - tests/v1
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  # split the test to avoid interference
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
@@ -327,6 +332,7 @@ steps:
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -340,6 +346,27 @@ steps:
  commands:
  - pytest -v -s v1/attention

- label: Batch Invariance Tests (H100) # 10min
  timeout_in_minutes: 25
  gpu: h100
  source_file_dependencies:
  - vllm/
  - tests/v1/determinism/
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pip install pytest-timeout pytest-forked
  - pytest -v -s v1/determinism/test_batch_invariance.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

- label: V1 Test attention (B200) # 10min
  timeout_in_minutes: 30
  gpu: b200
  source_file_dependencies:
  - vllm/v1/attention
  - tests/v1/attention
  commands:
  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this

- label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
  - vllm/
@@ -417,7 +444,7 @@ steps:
    --ignore=lora/test_llm_with_multi_loras.py \
    --ignore=lora/test_olmoe_tp.py \
    --ignore=lora/test_deepseekv2_tp.py \
    --ignore=lora/test_gptoss.py \
    --ignore=lora/test_gptoss_tp.py \
    --ignore=lora/test_qwen3moe_tp.py

  parallelism: 4
@@ -430,15 +457,12 @@ steps:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_pass_manager.py
  - pytest -v -s compile/test_fusion.py
  - pytest -v -s compile/test_fusion_attn.py
  - pytest -v -s compile/test_functionalization.py
  - pytest -v -s compile/test_silu_mul_quant_fusion.py
  - pytest -v -s compile/test_fusion_all_reduce.py
  - pytest -v -s compile/test_decorator.py
  - pytest -v -s compile/test_noop_elimination.py
  - pytest -v -s compile/test_aot_compile.py
  # Run unit tests defined directly under compile/,
  # not including subdirectories, which are usually heavier
  # tests covered elsewhere.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -448,19 +472,25 @@ steps:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/
  # Run smoke tests under fullgraph directory, except test_full_graph.py
  # as it is a heavy test that is covered in other steps.
  # Use `find` to launch multiple instances of pytest so that
  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Test # 22min
  timeout_in_minutes: 35
- label: PyTorch Fullgraph Test # 27min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

- label: Cudagraph test
  timeout_in_minutes: 20
@@ -532,10 +562,32 @@ steps:
  commands:
  - pytest -v -s kernels/mamba

- label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
  gpu: h100
  num_gpus: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization/test_block_fp8.py
  - tests/kernels/moe/test_deepgemm.py
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
  - pytest -v -s kernels/moe/test_deepgemm.py
  - pytest -v -s kernels/moe/test_batched_deepgemm.py
  - pytest -v -s kernels/attention/test_deepgemm_attention.py

- label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -579,6 +631,7 @@ steps:
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: LM Eval Small Models # 53min
@@ -587,6 +640,7 @@ steps:
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

@@ -638,6 +692,7 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/transformers_utils/
  - tests/models/test_initialization.py
  commands:
  # Only when vLLM model source is modified - test initialization of a large
@@ -764,14 +819,24 @@ steps:
  commands:
  - pytest -v -s models/language/pooling_mteb_test

- label: Multi-Modal Processor Test # 44min
- label: Multi-Modal Processor Test (CPU)
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  no_gpu: true
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Processor Test
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing
  - pytest -v -s models/multimodal/processing/test_tensor_schema.py

- label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 80
@@ -848,6 +913,7 @@ steps:
- label: Transformers Nightly Models Test
  working_dir: "/vllm-workspace/"
  optional: true
  soft_fail: true
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
  - pytest -v -s tests/models/test_initialization.py
@@ -873,11 +939,16 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
  - vllm/v1/attention/backends/mla/flashinfer_mla.py
  - vllm/platforms/cuda.py
  - vllm/attention/selector.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
  - pytest -v -s tests/kernels/attention/test_attention_selector.py
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -894,8 +965,9 @@ steps:
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion Tests # 30 min
- label: Blackwell Fusion and Compile Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
@@ -903,18 +975,50 @@ steps:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@@ -1012,7 +1116,7 @@ steps:
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/test_basic_correctness.py
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
@@ -1024,10 +1128,11 @@ steps:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1119,6 +1224,7 @@ steps:
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
  - pytest -v -s -x lora/test_olmoe_tp.py
  - pytest -v -s -x lora/test_gptoss_tp.py


- label: Weight Loading Multiple GPU Test # 33min
@@ -1206,12 +1312,14 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/compile/test_async_tp.py
  - pytest -v -s tests/compile/test_sequence_parallelism.py
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
  - pytest -v -s tests/v1/distributed/test_dbo.py

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1222,6 +1330,7 @@ steps:
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
  - pytest -v -s tests/v1/distributed/test_dbo.py

##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
@@ -1234,3 +1343,30 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
  - bash .buildkite/scripts/run-prime-rl-test.sh

- label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
  gpu: b200
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

.github/CODEOWNERS (61 changes, vendored)
@@ -3,13 +3,14 @@

# These lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
@@ -20,27 +21,30 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC

# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon

# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
@@ -49,18 +53,29 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
/tests/v1/determinism @yewentao256

# Transformers backend
# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor

# Observability
/vllm/config/observability.py @markmc
/vllm/v1/metrics @markmc
/tests/v1/metrics @markmc
/vllm/tracing.py @markmc
/tests/v1/tracing/test_tracing.py @markmc
/vllm/config/kv_events.py @markmc
/vllm/distributed/kv_events.py @markmc
/tests/distributed/test_events.py @markmc

# Docs
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
@@ -105,11 +120,21 @@ mkdocs.yaml @hmellor
/vllm/attention/ops/triton_unified_attention.py @tdoublep

# ROCm related: specify owner with write access to notify AMD folks for careful code review
/docker/Dockerfile.rocm* @gshtras
/vllm/v1/attention/backends/rocm*.py @gshtras
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
/vllm/**/*rocm* @tjtanaa
/docker/Dockerfile.rocm* @gshtras @tjtanaa
/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
/csrc/rocm @gshtras @tjtanaa
/requirements/*rocm* @tjtanaa
/tests/**/*rocm* @tjtanaa
/docs/**/*rocm* @tjtanaa
/vllm/**/*quark* @tjtanaa
/tests/**/*quark* @tjtanaa
/docs/**/*quark* @tjtanaa
/vllm/**/*aiter* @tjtanaa
/tests/**/*aiter* @tjtanaa

# TPU
/vllm/v1/worker/tpu* @NickLucche
@@ -124,6 +149,12 @@ mkdocs.yaml @hmellor
/examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop

# Security guide and policies
/docs/usage/security.md @russellb
/SECURITY.md @russellb
/docs/contributing/vulnerability_management.md @russellb

.github/mergify.yml (17 changes, vendored)
@@ -151,6 +151,23 @@ pull_request_rules:
      add:
        - gpt-oss

- name: label-nvidia
  description: Automatically apply nvidia label
  conditions:
    - label != stale
    - or:
      - files~=cuda
      - files~=cutlass
      - files~=flashinfer
      - files~=trtllm
      - title~=(?i)NVIDIA
      - title~=(?i)CUDA
      - title~=(?i)CUTLASS
  actions:
    label:
      add:
        - nvidia

- name: label-rocm
  description: Automatically apply rocm label
  conditions:

.github/workflows/cleanup_pr_body.yml (2 changes, vendored)
@@ -13,7 +13,7 @@ jobs:

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0

      - name: Set up Python
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0

.github/workflows/issue_autolabel.yml (25 changes, vendored)
@@ -105,6 +105,31 @@ jobs:
              }
            ],
          },
          cpu: {
            // Keyword search - matches whole words only (with word boundaries)
            keywords: [
              {
                term: "CPU Backend",
                searchIn: "title"
              },
              {
                term: "x86",
                searchIn: "title"
              },
              {
                term: "ARM",
                searchIn: "title"
              },
              {
                term: "Apple Silicon",
                searchIn: "title"
              },
              {
                term: "IBM Z",
                searchIn: "title"
              },
            ],
          },
          // Add more label configurations here as needed
          // example: {
          //   keywords: [...],

.github/workflows/macos-smoke-test.yml (new file, 80 lines, vendored)
@@ -0,0 +1,80 @@
name: macOS Apple Silicon Smoke Test

on:
  push:
    branches:
      - main
  workflow_dispatch: # Manual trigger

jobs:
  macos-m1-smoke-test:
    runs-on: macos-latest
    timeout-minutes: 30

    steps:
      - uses: actions/checkout@v6

      - uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: |
            requirements/**/*.txt
            pyproject.toml
          python-version: '3.12'

      - name: Create virtual environment
        run: |
          uv venv
          echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"

      - name: Install dependencies and build vLLM
        run: |
          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
          uv pip install -e .
        env:
          CMAKE_BUILD_PARALLEL_LEVEL: 4

      - name: Verify installation
        run: |
          python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"

      - name: Smoke test vllm serve
        run: |
          # Start server in background
          vllm serve Qwen/Qwen3-0.6B \
            --max-model-len=2K \
            --load-format=dummy \
            --hf-overrides '{"num_hidden_layers": 2}' \
            --enforce-eager \
            --port 8000 &

          SERVER_PID=$!

          # Wait for server to start
          for i in {1..30}; do
            if curl -s http://localhost:8000/health > /dev/null; then
              echo "Server started successfully"
              break
            fi
            if [ "$i" -eq 30 ]; then
              echo "Server failed to start"
              kill "$SERVER_PID"
              exit 1
            fi
            sleep 2
          done

          # Test health endpoint
          curl -f http://localhost:8000/health

          # Test completion
          curl -f http://localhost:8000/v1/completions \
            -H "Content-Type: application/json" \
            -d '{
              "model": "Qwen/Qwen3-0.6B",
              "prompt": "Hello",
              "max_tokens": 5
            }'

          # Cleanup
          kill "$SERVER_PID"
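For local debugging outside CI, the readiness check above can be reproduced with a few lines of Python; a minimal sketch, assuming the server was started with the same `vllm serve ... --port 8000` command and exposes the `/health` and `/v1/completions` endpoints the workflow exercises:

```python
import json
import time
import urllib.request

BASE = "http://localhost:8000"

# Poll /health until the server responds, mirroring the workflow's retry loop.
for _ in range(30):
    try:
        urllib.request.urlopen(f"{BASE}/health", timeout=2)
        break
    except OSError:
        time.sleep(2)
else:
    raise SystemExit("Server failed to start")

# Issue the same smoke-test completion request as the workflow.
body = json.dumps(
    {"model": "Qwen/Qwen3-0.6B", "prompt": "Hello", "max_tokens": 5}
).encode()
req = urllib.request.Request(
    f"{BASE}/v1/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)
print(urllib.request.urlopen(req).read().decode())
```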

.github/workflows/pre-commit.yml (2 changes, vendored)
@@ -16,7 +16,7 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
      - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: "3.12"

.gitignore (6 changes, vendored)
@@ -4,6 +4,9 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*

# OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/*

# triton jit
.triton

@@ -221,3 +224,6 @@ csrc/moe/marlin_moe_wna16/kernel_*

# Ignore ep_kernels_workspace folder
ep_kernels_workspace/

# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/
@@ -3,10 +3,9 @@ MD007:
MD013: false
MD024:
  siblings_only: true
MD031:
  list_items: false
MD033: false
MD045: false
MD046: false
MD051: false
MD052: false
MD053: false
MD059: false
@@ -38,7 +38,7 @@ repos:
    rev: 0.9.1
    hooks:
      - id: pip-compile
        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
        args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
        files: ^requirements/test\.(in|txt)$
  - repo: local
    hooks:

CMakeLists.txt (169 changes)
@@ -39,6 +39,13 @@ set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

# ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake.
if(NOT DEFINED ROCM_PATH)
  set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
else()
  set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
endif()
#
# Supported/expected torch versions for CUDA/ROCm.
#
@@ -129,7 +136,7 @@ elseif(HIP_FOUND)

  # ROCm 5.X and 6.X
  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
      Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
    message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
        "expected for ROCm build, saw ${Torch_VERSION} instead.")
  endif()
@@ -237,11 +244,28 @@ set_gencode_flags_for_srcs(
  SRCS "${VLLM_CUMEM_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling cumem allocator extension.")
  # link against cuda driver library
  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
  define_gpu_extension_target(
  if(VLLM_GPU_LANG STREQUAL "CUDA")
    # link against cuda driver library
    list(APPEND CUMEM_LIBS CUDA::cuda_driver)
  else()
    # link against rocm driver library. Prefer an absolute path to
    # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
    # back to linking by name "amdhip64".
    find_library(AMDHIP64_LIB
      NAMES amdhip64 libamdhip64.so
      PATHS ${ROCM_PATH}/lib
      NO_DEFAULT_PATH)
    if(AMDHIP64_LIB)
      message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
      list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
    else()
      message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
      list(APPEND CUMEM_LIBS amdhip64)
    endif()
  endif()
  define_extension_target(
    cumem_allocator
    DESTINATION vllm
    LANGUAGE CXX
@@ -265,6 +289,7 @@ set(VLLM_EXT_SRC
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/fused_qknorm_rope_kernel.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
@@ -282,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
  set(CUTLASS_REVISION "v4.2.1")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -329,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
  # 9.0 for latest bf16 atomicAdd PTX
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")

  # marlin arches for fp16 output
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
  # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
  # marlin arches for fp8 input
  # - sm80 doesn't support fp8 computation
  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")

  if (MARLIN_ARCHS)

    #
@@ -340,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
    set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")

    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
          PYTHONPATH=$PYTHONPATH
          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE marlin_generation_result
        OUTPUT_VARIABLE marlin_generation_result
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -362,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
          "\nCheck the log for details: "
          "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
      else()
        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
          CACHE STRING "Last run Marlin generate script hash" FORCE)
        set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
          CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
        message(STATUS "Marlin generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin generation script has not changed, skipping generation.")
    endif()
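The net effect of the change above is that the cached key now invalidates on either a generator-script edit or a different `CUDA_ARCHS` list, so kernels are regenerated whenever the target architectures change. A minimal Python sketch of the same cache-key idea (illustrative only; the build itself uses CMake's `file(MD5 ...)` and cache variables shown above):

```python
import hashlib


def generation_cache_key(script_path: str, cuda_archs: list[str]) -> str:
    # Hash the generator script's contents and fold in the arch list, so a
    # change to either one produces a different key and triggers regeneration.
    with open(script_path, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return f"{digest}(ARCH:{','.join(cuda_archs)})"
```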
    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -378,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()

    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

    file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})

    if (MARLIN_FP8_ARCHS)
      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
        CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
        set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
      endif()
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
    endif()

    set(MARLIN_SRCS
      "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin.cu"
      "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
      "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
      "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
@@ -487,9 +545,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
@@ -579,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -594,9 +655,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # FP4 Archs and flags
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
@@ -670,7 +731,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
@@ -716,9 +777,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
@@ -836,7 +897,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()

  # Hadacore kernels
  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
  if(HADACORE_ARCHS)
    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
    set_gencode_flags_for_srcs(
@@ -858,7 +919,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
endif()

message(STATUS "Enabling C extension.")
define_gpu_extension_target(
define_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
@@ -913,8 +974,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    CUDA_ARCHS "${CUDA_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
  # 9.0 for latest bf16 atomicAdd PTX
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  # moe marlin arches
  # note that we always set `use_atomic_add=False` for moe marlin now,
  # so we don't need 9.0 for bf16 atomicAdd PTX
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
  # moe marlin arches for fp8 input
  # - sm80 doesn't support fp8 computation
  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
  cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)

    #
@@ -924,16 +992,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MOE_MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
    set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")

    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
    message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")

    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
          PYTHONPATH=$PYTHONPATH
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -946,7 +1016,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
          "\nCheck the log for details: "
          "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
      else()
        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
        set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
          CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
        message(STATUS "Marlin MOE generation completed successfully.")
      endif()
@@ -954,16 +1024,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
    endif()

    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
    file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
    list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
      set_source_files_properties(${MARLIN_MOE_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})

    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
    if (MARLIN_MOE_FP8_ARCHS)
      file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_MOE_FP8_SRC}"
        CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
        set_source_files_properties(${MARLIN_MOE_FP8_SRC}
          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
      endif()
      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
    endif()

    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
@@ -973,7 +1055,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
define_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
@@ -994,7 +1076,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  "csrc/rocm/skinny_gemms.cu"
  "csrc/rocm/attention.cu")

define_gpu_extension_target(
define_extension_target(
  _rocm_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
@@ -1005,6 +1087,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  WITH_SOABI)
endif()

# For CUDA and HIP builds also build the triton_kernels external package.
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  include(cmake/external_projects/triton_kernels.cmake)
endif()

# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
  include(cmake/external_projects/flashmla.cmake)
@@ -21,6 +21,9 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio

*Latest News* 🔥

- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and the recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI).
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
@@ -83,7 +86,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support
- Multi-LoRA support
@@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```

#### 2. Maximize Throughput with a Latency Requirement
### 2. Maximize Throughput with a Latency Requirement

- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
@@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```

#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
### 3. Maximize Throughput with Prefix Caching and Latency Requirements

- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
@@ -620,7 +620,7 @@ def get_tokenizer(
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        try:
            from vllm.transformers_utils.tokenizer import MistralTokenizer
            from vllm.tokenizers import MistralTokenizer
        except ImportError as e:
            raise ImportError(
                "MistralTokenizer requires vllm package.\n"
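This release moves several tokenizer imports from `vllm.transformers_utils.tokenizer` to the new `vllm.tokenizers` package. Code that has to run against both old and new releases can use the same try/except pattern the benchmark scripts below use; a sketch, assuming the old path remains importable on pre-0.12 versions:

```python
try:
    from vllm.tokenizers import MistralTokenizer  # vLLM >= 0.12
except ImportError:
    from vllm.transformers_utils.tokenizer import MistralTokenizer  # older vLLM
```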

benchmarks/benchmark_batch_invariance.py (new executable file, 380 lines)
@@ -0,0 +1,380 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.

This benchmark runs the same workload twice:
1. With VLLM_BATCH_INVARIANT=0 (baseline)
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)

And reports the timing and throughput metrics for comparison.

Environment variables:
    VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
    VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
    VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
    VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
    VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
    VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
    VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
    VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
    VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
    VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
    VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)

Example usage:
    # Benchmark qwen3 (default)
    python benchmarks/benchmark_batch_invariance.py

    # Benchmark deepseek with 8 GPUs
    VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
        python benchmarks/benchmark_batch_invariance.py

    # Quick test with fewer trials
    VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
        python benchmarks/benchmark_batch_invariance.py
"""

import contextlib
import os
import random
import time

from vllm import LLM, SamplingParams
from vllm.platforms import current_platform


def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
    """Generate a random prompt for benchmarking."""
    prompt_templates = [
        "Question: What is the capital of France?\nAnswer: The capital of France is",
        "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
        "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
        "Once upon a time in a distant galaxy, there lived",
        "The old man walked slowly down the street, remembering",
        "In the year 2157, humanity finally discovered",
        "To implement a binary search tree in Python, first we need to",
        "The algorithm works by iterating through the array and",
        "Here's how to optimize database queries using indexing:",
        "The Renaissance was a period in European history that",
        "Climate change is caused by several factors including",
        "The human brain contains approximately 86 billion neurons which",
        "I've been thinking about getting a new laptop because",
        "Yesterday I went to the store and bought",
        "My favorite thing about summer is definitely",
    ]

    base_prompt = random.choice(prompt_templates)

    if max_words < min_words:
        max_words = min_words
    target_words = random.randint(min_words, max_words)

    if target_words > 50:
        padding_text = (
            " This is an interesting topic that deserves more explanation. "
            * (target_words // 50)
        )
        base_prompt = base_prompt + padding_text

    return base_prompt


def run_benchmark_with_batch_invariant(
    model: str,
    tp_size: int,
    max_batch_size: int,
    num_trials: int,
    min_prompt: int,
    max_prompt: int,
    max_tokens: int,
    temperature: float,
    gpu_mem_util: float,
    max_model_len: int,
    backend: str,
    batch_invariant: bool,
    seed: int = 12345,
) -> dict:
    """
    Run the benchmark with the specified configuration.

    Returns a dict with timing and throughput metrics.
    """
    random.seed(seed)

    # Set environment variables
    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    if batch_invariant:
        os.environ["VLLM_BATCH_INVARIANT"] = "1"
    else:
        os.environ["VLLM_BATCH_INVARIANT"] = "0"

    print(f"\n{'=' * 80}")
    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
    print(f"  Model: {model}")
    print(f"  TP Size: {tp_size}")
    print(f"  Backend: {backend}")
    print(f"  Max Batch Size: {max_batch_size}")
    print(f"  Trials: {num_trials}")
    print(f"  Max Tokens: {max_tokens}")
    print(f"{'=' * 80}\n")

    sampling = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=max_tokens,
        seed=20240919,
    )

    needle_prompt = "There once was a "

    llm = None
    try:
        # Create LLM engine
        start_init = time.perf_counter()
        llm = LLM(
            model=model,
            max_num_seqs=max_batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
            dtype="bfloat16",
            tensor_parallel_size=tp_size,
            enable_prefix_caching=False,
        )
        init_time = time.perf_counter() - start_init
        print(f"Engine initialization time: {init_time:.2f}s\n")

        # Generate baseline
        print("Generating baseline (warmup)...")
        baseline_out = llm.generate([needle_prompt], sampling)
        assert len(baseline_out) == 1
        baseline_text = baseline_out[0].outputs[0].text
        print(f"Baseline output: '{baseline_text[:50]}...'\n")

        # Run trials and measure timing
        trial_times: list[float] = []
        total_tokens = 0
        total_prompts = 0

        for trial in range(num_trials):
            # Create a batch
            prompts: list[str] = []
            batch_size = random.randint(max_batch_size // 2, max_batch_size)
            needle_pos = random.randint(0, batch_size - 1)
            for i in range(batch_size):
                if i == needle_pos:
                    prompts.append(needle_prompt)
                else:
                    prompts.append(_random_prompt(min_prompt, max_prompt))

            # Measure time for this trial
            start_time = time.perf_counter()
            outputs = llm.generate(prompts, sampling)
            trial_time = time.perf_counter() - start_time

            trial_times.append(trial_time)
            total_prompts += len(prompts)

            # Count tokens
            for output in outputs:
                if output.outputs:
                    total_tokens += len(output.outputs[0].token_ids)

            print(
                f"Trial {trial + 1}/{num_trials}: "
                f"batch_size={batch_size}, "
                f"time={trial_time:.2f}s"
            )

            # Verify needle output still matches
            needle_output = outputs[needle_pos]
            assert needle_output.prompt == needle_prompt

        # Compute statistics
        avg_time = sum(trial_times) / len(trial_times)
        min_time = min(trial_times)
        max_time = max(trial_times)
        throughput = total_tokens / sum(trial_times)
        prompts_per_sec = total_prompts / sum(trial_times)

        print(f"\n{'=' * 80}")
        print("RESULTS:")
        print(f"  Average time per trial: {avg_time:.2f}s")
        print(f"  Min time: {min_time:.2f}s")
        print(f"  Max time: {max_time:.2f}s")
        print(f"  Total tokens generated: {total_tokens}")
        print(f"  Total prompts processed: {total_prompts}")
        print(f"  Throughput: {throughput:.2f} tokens/s")
        print(f"  Prompts/s: {prompts_per_sec:.2f}")
        print(f"{'=' * 80}\n")

        return {
            "init_time": init_time,
            "avg_time": avg_time,
            "min_time": min_time,
            "max_time": max_time,
            "total_tokens": total_tokens,
            "total_prompts": total_prompts,
            "throughput": throughput,
            "prompts_per_sec": prompts_per_sec,
            "trial_times": trial_times,
        }

    finally:
        # Cleanup
        if llm is not None:
            with contextlib.suppress(Exception):
                llm.shutdown()


def main():
    # Check platform support
    if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
        print("ERROR: Requires CUDA and >= Hopper (SM90)")
        print(f"Current platform: {current_platform.device_type}")
        if current_platform.is_cuda():
            print(f"Device capability: {current_platform.get_device_capability()}")
        return 1

    # Read configuration from environment
    model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
    tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
    max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
    num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
    min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
    max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
    max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
    temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
    gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
    max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
    backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")

    print("\n" + "=" * 80)
    print("VLLM BATCH INVARIANCE BENCHMARK")
    print("=" * 80)
    print("\nConfiguration:")
    print(f"  Model: {model}")
    print(f"  Tensor Parallel Size: {tp_size}")
    print(f"  Attention Backend: {backend}")
    print(f"  Max Batch Size: {max_batch_size}")
    print(f"  Number of Trials: {num_trials}")
    print(f"  Prompt Length Range: {min_prompt}-{max_prompt} words")
    print(f"  Max Tokens to Generate: {max_tokens}")
    print(f"  Temperature: {temperature}")
    print(f"  GPU Memory Utilization: {gpu_mem_util}")
    print(f"  Max Model Length: {max_model_len}")
    print("=" * 80)

    # Run benchmark WITHOUT batch invariance (baseline)
    print("\n" + "=" * 80)
    print("PHASE 1: Running WITHOUT batch invariance (baseline)")
    print("=" * 80)
    baseline_results = run_benchmark_with_batch_invariant(
        model=model,
        tp_size=tp_size,
        max_batch_size=max_batch_size,
        num_trials=num_trials,
        min_prompt=min_prompt,
        max_prompt=max_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        gpu_mem_util=gpu_mem_util,
        max_model_len=max_model_len,
        backend=backend,
        batch_invariant=False,
    )

    # Run benchmark WITH batch invariance
    print("\n" + "=" * 80)
    print("PHASE 2: Running WITH batch invariance")
    print("=" * 80)
    batch_inv_results = run_benchmark_with_batch_invariant(
        model=model,
        tp_size=tp_size,
        max_batch_size=max_batch_size,
        num_trials=num_trials,
        min_prompt=min_prompt,
        max_prompt=max_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        gpu_mem_util=gpu_mem_util,
        max_model_len=max_model_len,
        backend=backend,
        batch_invariant=True,
    )

    # Compare results
    print("\n" + "=" * 80)
    print("COMPARISON: Batch Invariance vs Baseline")
    print("=" * 80)

    init_overhead_pct = (
        (batch_inv_results["init_time"] - baseline_results["init_time"])
        / baseline_results["init_time"]
        * 100
    )
    time_overhead_pct = (
        (batch_inv_results["avg_time"] - baseline_results["avg_time"])
        / baseline_results["avg_time"]
        * 100
    )
    throughput_change_pct = (
        (batch_inv_results["throughput"] - baseline_results["throughput"])
        / baseline_results["throughput"]
        * 100
    )

    print("\nInitialization Time:")
    print(f"  Baseline: {baseline_results['init_time']:.2f}s")
    print(f"  Batch Invariant: {batch_inv_results['init_time']:.2f}s")
    print(f"  Overhead: {init_overhead_pct:+.2f}%")

    print("\nAverage Trial Time:")
    print(f"  Baseline: {baseline_results['avg_time']:.2f}s")
    print(f"  Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
    print(f"  Overhead: {time_overhead_pct:+.2f}%")

    print("\nThroughput (tokens/s):")
    print(f"  Baseline: {baseline_results['throughput']:.2f}")
    print(f"  Batch Invariant: {batch_inv_results['throughput']:.2f}")
    print(f"  Change: {throughput_change_pct:+.2f}%")

    print("\nPrompts/s:")
    print(f"  Baseline: {baseline_results['prompts_per_sec']:.2f}")
    print(f"  Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    if time_overhead_pct > 0:
        print(
            f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
            "overhead"
        )
    else:
        print(
            f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
            "faster (unexpected!)"
        )

    if abs(throughput_change_pct) < 1.0:
        print("Throughput difference is negligible (< 1%)")
    elif throughput_change_pct < 0:
        print(
            f"Throughput decreased by {-throughput_change_pct:.1f}% "
            "with batch invariance"
        )
    else:
        print(
            f"Throughput increased by {throughput_change_pct:.1f}% "
            "with batch invariance (unexpected!)"
        )

    print("=" * 80 + "\n")

    return 0


if __name__ == "__main__":
    exit(main())
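The script above measures only the performance overhead of batch-invariant mode; it does not verify invariance itself. A minimal sketch of such a check, reusing the model and API shown in the benchmark (`VLLM_BATCH_INVARIANT=1` must be set before the engine is created):

```python
import os

os.environ.setdefault("VLLM_BATCH_INVARIANT", "1")

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen3-1.7B", enable_prefix_caching=False)
params = SamplingParams(temperature=0.0, max_tokens=32, seed=0)

prompt = "There once was a "
# With batch invariance enabled, the same prompt should decode identically
# whether it runs alone or alongside other requests in a larger batch.
solo = llm.generate([prompt], params)[0].outputs[0].text
batched = llm.generate([prompt] + ["Tell me about GPUs."] * 7, params)
assert solo == batched[0].outputs[0].text, "outputs diverged across batch sizes"
```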
@@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
        device_config=DeviceConfig(device=current_platform.device_type),
        parallel_config=ParallelConfig(),
        load_config=LoadConfig(),
        scheduler_config=SchedulerConfig(),
        scheduler_config=SchedulerConfig(
            max_model_len=model_config.max_model_len,
            is_encoder_decoder=model_config.is_encoder_decoder,
        ),
    )

    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
@@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser

try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
    from vllm.tokenizers import get_tokenizer
except ImportError:
    from backend_request_func import get_tokenizer
@@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:

    # Remove the special tokens.
    return random.choices(
        [v for k, v in vocab.items() if k not in all_special_ids],
        [v for v in vocab.values() if v not in all_special_ids],
        k=length,
    )
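This fixes the special-token filter: a tokenizer's `vocab` maps token strings to integer IDs while `all_special_ids` holds integer IDs, so the old comparison checked strings against ints and never filtered anything. A self-contained illustration with a hypothetical toy vocabulary (not the real tokenizer data):

```python
vocab = {"<s>": 0, "</s>": 1, "hello": 2, "world": 3}
all_special_ids = {0, 1}

# Old (buggy): compares token *strings* against special-token *IDs*,
# so nothing is ever filtered out.
assert [v for k, v in vocab.items() if k not in all_special_ids] == [0, 1, 2, 3]

# New (fixed): compares IDs against IDs, so special tokens are excluded.
assert [v for v in vocab.values() if v not in all_special_ids] == [2, 3]
```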
@@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase

try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
    from vllm.tokenizers import get_tokenizer
except ImportError:
    from backend_request_func import get_tokenizer
@@ -5,11 +5,12 @@ import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
from quart import Quart, Response, make_response, request
|
||||
from rate_limiter import RateLimiter
|
||||
from request_queue import RequestQueue
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -24,26 +25,8 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=float,
|
||||
default=300,
|
||||
help="Timeout for backend service requests in seconds (default: 300)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-concurrent",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Maximum concurrent requests to backend services (default: 100)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--queue-size",
|
||||
type=int,
|
||||
default=500,
|
||||
help="Maximum number of requests in the queue (default: 500)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rate-limit",
|
||||
type=int,
|
||||
default=40,
|
||||
help="Maximum requests per second (default: 40)",
|
||||
default=6 * 60 * 60,
|
||||
help="Timeout for backend service requests in seconds (default: 21600)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
@@ -54,14 +37,32 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--prefill-url",
|
||||
type=str,
|
||||
default="http://localhost:8100/v1/completions",
|
||||
help="Prefill service endpoint URL",
|
||||
default="http://localhost:8100",
|
||||
help="Prefill service base URL (protocol + host[:port])",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decode-url",
|
||||
type=str,
|
||||
default="http://localhost:8200/v1/completions",
|
||||
help="Decode service endpoint URL",
|
||||
default="http://localhost:8200",
|
||||
help="Decode service base URL (protocol + host[:port])",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--kv-host",
|
||||
type=str,
|
||||
default="localhost",
|
||||
help="Hostname or IP used by KV transfer (default: localhost)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefill-kv-port",
|
||||
type=int,
|
||||
default=14579,
|
||||
help="Prefill KV port (default: 14579)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decode-kv-port",
|
||||
type=int,
|
||||
default=14580,
|
||||
help="Decode KV port (default: 14580)",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
@@ -73,70 +74,129 @@ def main():
|
||||
|
||||
# Initialize configuration using command line parameters
|
||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
|
||||
MAX_CONCURRENT_REQUESTS = args.max_concurrent
|
||||
REQUEST_QUEUE_SIZE = args.queue_size
|
||||
RATE_LIMIT = args.rate_limit
|
||||
PREFILL_SERVICE_URL = args.prefill_url
|
||||
DECODE_SERVICE_URL = args.decode_url
|
||||
PORT = args.port
|
||||
|
||||
PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
|
||||
DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
|
||||
|
||||
logger.info(
|
||||
"Proxy resolved KV addresses -> prefill: %s, decode: %s",
|
||||
PREFILL_KV_ADDR,
|
||||
DECODE_KV_ADDR,
|
||||
)
|
||||
|
||||
app = Quart(__name__)
|
||||
|
||||
# Initialize the rate limiter and request queue
|
||||
rate_limiter = RateLimiter(RATE_LIMIT)
|
||||
request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
|
||||
|
||||
# Attach the configuration object to the application instance
|
||||
# Attach the configuration object to the application instance so helper
|
||||
# coroutines can read the resolved backend URLs and timeouts without using
|
||||
# globals.
|
||||
app.config.update(
|
||||
{
|
||||
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
|
||||
"rate_limiter": rate_limiter,
|
||||
"request_queue": request_queue,
|
||||
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
|
||||
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
|
||||
"PREFILL_KV_ADDR": PREFILL_KV_ADDR,
|
||||
"DECODE_KV_ADDR": DECODE_KV_ADDR,
|
||||
}
|
||||
)
|
||||
|
||||
# Start queue processing on app startup
|
||||
@app.before_serving
|
||||
async def startup():
|
||||
"""Start request processing task when app starts serving"""
|
||||
asyncio.create_task(request_queue.process())
|
||||
def _normalize_base_url(url: str) -> str:
|
||||
"""Remove any trailing slash so path joins behave predictably."""
|
||||
return url.rstrip("/")
|
||||
|
||||
async def forward_request(url, data):
|
||||
"""Forward request to backend service with rate limiting and error handling"""
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
def _get_host_port(url: str) -> str:
|
||||
"""Return the hostname:port portion for logging and KV headers."""
|
||||
parsed = urlparse(url)
|
||||
host = parsed.hostname or "localhost"
|
||||
port = parsed.port
|
||||
if port is None:
|
||||
port = 80 if parsed.scheme == "http" else 443
|
||||
return f"{host}:{port}"

# Use rate limiter as context manager
async with (
    rate_limiter,
    aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
):
    try:
        async with session.post(
            url=url, json=data, headers=headers
        ) as response:
            if response.status == 200:
                # Stream response chunks
                async for chunk_bytes in response.content.iter_chunked(1024):
                    yield chunk_bytes
            else:
                # Handle backend service errors
                error_text = await response.text()
                logger.error(
                    "Backend service error: %s - %s",
                    response.status,
                    error_text,
                )
                yield b'{"error": "Backend service error"}'
    except aiohttp.ClientError as e:
        # Handle connection errors
        logger.error("Connection error to %s: %s", url, str(e))
        yield b'{"error": "Service unavailable"}'
    except asyncio.TimeoutError:
        # Handle timeout errors
        logger.error("Timeout connecting to %s", url)
        yield b'{"error": "Service timeout"}'
PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
KV_TARGET = _get_host_port(DECODE_SERVICE_URL)

def _build_headers(request_id: str) -> dict[str, str]:
    """Construct the headers expected by vLLM's P2P disagg connector."""
    headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    return headers

async def _run_prefill(
    request_path: str,
    payload: dict,
    headers: dict[str, str],
    request_id: str,
):
    url = f"{PREFILL_BASE}{request_path}"
    start_ts = time.perf_counter()
    logger.info("[prefill] start request_id=%s url=%s", request_id, url)
    try:
        async with (
            aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
            session.post(url=url, json=payload, headers=headers) as resp,
        ):
            if resp.status != 200:
                error_text = await resp.text()
                raise RuntimeError(
                    f"Prefill backend error {resp.status}: {error_text}"
                )
            await resp.read()
            logger.info(
                "[prefill] done request_id=%s status=%s elapsed=%.2fs",
                request_id,
                resp.status,
                time.perf_counter() - start_ts,
            )
    except asyncio.TimeoutError as exc:
        raise RuntimeError(f"Prefill service timeout at {url}") from exc
    except aiohttp.ClientError as exc:
        raise RuntimeError(f"Prefill service unavailable at {url}") from exc

async def _stream_decode(
    request_path: str,
    payload: dict,
    headers: dict[str, str],
    request_id: str,
):
    url = f"{DECODE_BASE}{request_path}"
    # Stream tokens from the decode service once the prefill stage has
    # materialized KV caches on the target workers.
    logger.info("[decode] start request_id=%s url=%s", request_id, url)
    try:
        async with (
            aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
            session.post(url=url, json=payload, headers=headers) as resp,
        ):
            if resp.status != 200:
                error_text = await resp.text()
                logger.error(
                    "Decode backend error %s - %s", resp.status, error_text
                )
                err_msg = (
                    '{"error": "Decode backend error ' + str(resp.status) + '"}'
                )
                yield err_msg.encode()
                return
            logger.info(
                "[decode] streaming response request_id=%s status=%s",
                request_id,
                resp.status,
            )
            async for chunk_bytes in resp.content.iter_chunked(1024):
                yield chunk_bytes
            logger.info("[decode] finished streaming request_id=%s", request_id)
    except asyncio.TimeoutError:
        logger.error("Decode service timeout at %s", url)
        yield b'{"error": "Decode service timeout"}'
    except aiohttp.ClientError as exc:
        logger.error("Decode service error at %s: %s", url, exc)
        yield b'{"error": "Decode service unavailable"}'

async def process_request():
    """Process a single request through prefill and decode stages"""
@@ -146,13 +206,27 @@ def main():
# Create prefill request (max_tokens=1)
prefill_request = original_request_data.copy()
prefill_request["max_tokens"] = 1
if "max_completion_tokens" in prefill_request:
    prefill_request["max_completion_tokens"] = 1

# Execute prefill stage
async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
    continue
# The request id encodes both KV socket addresses so the backend can
# shuttle tensors directly via NCCL once the prefill response
# completes.
request_id = (
    f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
    f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
)
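
Because the id is plain string concatenation, the receiving side can recover both socket addresses by pattern matching. A sketch of that round trip (the regex-based parse is an assumption for illustration, not code from this PR):

```python
import re
import uuid

PREFILL_KV_ADDR = "localhost:14579"
DECODE_KV_ADDR = "localhost:14580"

request_id = (
    f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
    f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
)

# Hypothetical parse on the receiving side:
m = re.match(r"___prefill_addr_(.+)___decode_addr_(.+)_([0-9a-f]{32})$", request_id)
assert m is not None
assert m.group(1) == PREFILL_KV_ADDR
assert m.group(2) == DECODE_KV_ADDR
```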

headers = _build_headers(request_id)
await _run_prefill(request.path, prefill_request, headers, request_id)

# Execute decode stage and stream response
generator = forward_request(DECODE_SERVICE_URL, original_request_data)
# Pass the unmodified user request so the decode phase can continue
# sampling with the already-populated KV cache.
generator = _stream_decode(
    request.path, original_request_data, headers, request_id
)
response = await make_response(generator)
response.timeout = None  # Disable timeout for streaming response
return response
@@ -168,23 +242,10 @@ def main():
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
    """Handle incoming API requests with concurrency and rate limiting"""
    # Create task for request processing
    task = asyncio.create_task(process_request())

    # Enqueue request or reject if queue is full
    if not await request_queue.enqueue(task):
        return Response(
            response=b'{"error": "Server busy, try again later"}',
            status=503,
            content_type="application/json",
        )

    try:
        # Return the response from the processing task
        return await task
        return await process_request()
    except asyncio.CancelledError:
        # Handle task cancellation (timeout or queue full)
        logger.warning("Request cancelled due to timeout or queue full")
        logger.warning("Request cancelled")
        return Response(
            response=b'{"error": "Request cancelled"}',
            status=503,

@@ -1,10 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os

# Disable DeepGEMM for this benchmark to use CUTLASS
os.environ["VLLM_USE_DEEP_GEMM"] = "0"

import torch

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    apply_w8a8_block_fp8_linear,
    W8A8BlockFp8LinearOp,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min

# Create random FP8 tensors
# Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp)
A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max

# Create quantized weight tensor
B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

# Create scales
# Create weight scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
@@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    * factor_for_scale
)

# SM90 CUTLASS requires row-major format for scales
if use_cutlass and current_platform.is_device_capability(90):
    Bs = Bs.T.contiguous()
# Create W8A8BlockFp8LinearOp instance
weight_group_shape = GroupShape(block_n, block_k)
act_quant_group_shape = GroupShape(1, block_k)  # Per-token, per-group quantization

linear_op = W8A8BlockFp8LinearOp(
    weight_group_shape=weight_group_shape,
    act_quant_group_shape=act_quant_group_shape,
    cutlass_block_fp8_supported=use_cutlass,
    use_aiter_and_is_supported=False,
)

def run():
    if use_cutlass:
        return apply_w8a8_block_fp8_linear(
            A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
        )
    else:
        return apply_w8a8_block_fp8_linear(
            A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
        )
    return linear_op.apply(
        input=A_ref,
        weight=B,
        weight_scale=Bs,
        input_scale=None,
        bias=None,
    )

return run


@@ -255,8 +255,8 @@ def bench_run(
torch.cuda.synchronize()

# Timing
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
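
Several benchmark files in this release make the same swap from `torch.cuda.Event` to the accelerator-agnostic `torch.Event`. The measurement pattern is unchanged; a minimal sketch of it (assumes a CUDA device and a PyTorch version that ships `torch.Event`):

```python
import torch

def time_fn_ms(fn, iters: int = 10) -> float:
    """Mean latency of fn in milliseconds, measured with device events."""
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters
```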

latencies = []
for _ in range(num_iters):

benchmarks/kernels/benchmark_fused_collective.py: new file (1,129 lines; diff suppressed because it is too large)

@@ -16,8 +16,8 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
from vllm.utils.argparse_utils import FlexibleArgumentParser

DEFAULT_MODELS = [
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
    "nm-testing/deepseekv2-lite",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "deepseek-ai/DeepSeek-V2-Lite",
    "ibm-granite/granite-3.0-1b-a400m",
    "ibm-granite/granite-3.0-3b-a800m",
]

@@ -19,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES

from vllm.triton_utils import HAS_TRITON
from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
from vllm.triton_utils import HAS_TRITON, triton

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
    from vllm.lora.ops.triton_ops import (  ## added fused_moe_lora
        LoRAKernelMeta,
        fused_moe_lora_expand,
        fused_moe_lora_shrink,
        lora_expand,
        lora_shrink,
    )
    from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
        _LORA_PTR_DICT,  ## added _LORA_PTR_DICT for fused_moe_lora
    )
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

from vllm import _custom_ops as ops
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.math_utils import round_up

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_TP_SIZES = [1]
@@ -59,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4]
DEFAULT_SORT_BY_LORA_IDS = [False, True]
DEFAULT_SEQ_LENGTHS = [1]
DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
DEFAULT_TOP_K_NUMS = [1]  # Added for MoE LoRA top_k
DEFAULT_NUM_EXPERTS = [8]  # Added for MoE LoRA num_experts


# Utilities
@@ -191,6 +204,11 @@ class OpType(Enum):

LORA_SHRINK = auto()
LORA_EXPAND = auto()
## Adding support for fused moe lora
FUSED_MOE_LORA_GATE_UP_SHRINK = auto()  ## Gate/Up projection variant with shrink
FUSED_MOE_LORA_GATE_UP_EXPAND = auto()  ## Gate/Up projection variant with expand
FUSED_MOE_LORA_DOWN_SHRINK = auto()  ## Down projection variant with shrink
FUSED_MOE_LORA_DOWN_EXPAND = auto()  ## Down projection variant with expand

@staticmethod
def from_str(s: str) -> "OpType":
@@ -198,6 +216,15 @@ class OpType(Enum):
    return OpType.LORA_SHRINK
if s.lower() == "lora_expand":
    return OpType.LORA_EXPAND
# Adding support for fused moe lora, both in gate_up and down
if s.lower() == "fused_moe_lora_gate_up_shrink":  ## Gate/Up variant with shrink
    return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK
if s.lower() == "fused_moe_lora_gate_up_expand":  ## Gate/Up variant with expand
    return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND
if s.lower() == "fused_moe_lora_down_shrink":  ## Down variant with shrink
    return OpType.FUSED_MOE_LORA_DOWN_SHRINK
if s.lower() == "fused_moe_lora_down_expand":  ## Down variant with expand
    return OpType.FUSED_MOE_LORA_DOWN_EXPAND
raise ValueError(f"Unrecognized str {s} to convert to OpType")

def is_shrink_fn(self) -> bool:
@@ -206,19 +233,56 @@ class OpType(Enum):
def is_expand_fn(self) -> bool:
    return self in [OpType.LORA_EXPAND]

def is_fused_moe_lora_fn(self) -> bool:  ## adding for fused MoE LoRA
    return self in [
        OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
        OpType.FUSED_MOE_LORA_DOWN_SHRINK,
        OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
        OpType.FUSED_MOE_LORA_DOWN_EXPAND,
    ]

def is_fused_moe_lora_gate_up_fn(
    self,
) -> bool:  ## adding for fused MoE LoRA Gate/Up
    return self in [
        OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
        OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
    ]

def is_fused_moe_lora_down_fn(self) -> bool:  ## adding for fused MoE LoRA Down
    return self in [
        OpType.FUSED_MOE_LORA_DOWN_SHRINK,
        OpType.FUSED_MOE_LORA_DOWN_EXPAND,
    ]

def is_fused_moe_lora_shrink_fn(self) -> bool:
    return self in [
        OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
        OpType.FUSED_MOE_LORA_DOWN_SHRINK,
    ]

def is_fused_moe_lora_expand_fn(self) -> bool:
    return self in [
        OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
        OpType.FUSED_MOE_LORA_DOWN_EXPAND,
    ]

def num_slices(self) -> list[int]:
    if self.is_fused_moe_lora_gate_up_fn():
        return [2]
    elif self.is_fused_moe_lora_down_fn():
        return [1]
    return [1, 2, 3]

def mkn(
    self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
) -> tuple[int, int, int]:
    num_tokens = batch_size * seq_length
    if self.is_shrink_fn():
    if self.is_shrink_fn() or self.is_fused_moe_lora_fn():
        m = num_tokens
        k = hidden_size
        n = lora_rank
    else:
        assert self.is_expand_fn()
    elif self.is_expand_fn():
        m = num_tokens
        k = lora_rank
        n = hidden_size
@@ -232,9 +296,36 @@ class OpType(Enum):
"""
if self.is_shrink_fn():
    return op_dtype, op_dtype, torch.float32
else:
    assert self.is_expand_fn()
elif self.is_expand_fn():
    return torch.float32, op_dtype, op_dtype
else:
    assert self.is_fused_moe_lora_fn()
    return op_dtype, op_dtype, op_dtype

def matmul_shapes_fused_moe_lora(
    self,
    m: int,
    n: int,
    k: int,
    num_loras: int,
    num_slices: int,
    top_k_num: int,
    num_experts: int,
) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]:
    if self.is_fused_moe_lora_shrink_fn():
        input_shape = (
            (m * top_k_num, n)
            if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
            else (m, n)
        )
        output_shape = (num_slices, m, top_k_num, k)
        weight_shape = (num_loras, num_experts, k, n)
    else:
        assert self.is_fused_moe_lora_expand_fn()
        input_shape = (num_slices, m, top_k_num, k)
        output_shape = (m, top_k_num, n * num_slices)
        weight_shape = (num_loras, num_experts, n, k)
    return (input_shape, weight_shape, output_shape)
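
A worked example makes the two branches easier to read. Note that `matmul_shapes` (below) passes the `(m, k, n)` triple from `mkn()` into this method's `(m, n, k)` parameters, so inside this method `n` is the hidden size and `k` is the LoRA rank. The numbers here are illustrative:

```python
# num_tokens m=8, hidden n=4096, rank k=16, num_loras=4, top_k_num=2, experts=8
# Gate/Up shrink (num_slices=2):
#   input  (8, 4096)         -> (m, n)
#   weight (4, 8, 16, 4096)  -> (num_loras, num_experts, k, n)
#   output (2, 8, 2, 16)     -> (num_slices, m, top_k_num, k)
# Gate/Up expand (num_slices=2):
#   input  (2, 8, 2, 16)     -> (num_slices, m, top_k_num, k)
#   weight (4, 8, 4096, 16)  -> (num_loras, num_experts, n, k)
#   output (8, 2, 8192)      -> (m, top_k_num, n * num_slices)
```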

def matmul_shapes(
    self,
@@ -244,6 +335,8 @@ class OpType(Enum):
    lora_rank: int,
    num_loras: int,
    num_slices: int,
    top_k_num: int | None = None,
    num_experts: int | None = None,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
    """
    Given num_slices, return the shapes of the A, B, and C matrices
@@ -258,6 +351,16 @@ class OpType(Enum):
    if self in [OpType.LORA_EXPAND]:
        # LoRA expand kernels support num_slices inherently in the kernel
        return ((num_slices, m, k), b_shape, (m, n * num_slices))
    if self.is_fused_moe_lora_fn():
        return self.matmul_shapes_fused_moe_lora(
            m,
            k,
            n,
            num_loras,
            num_slices,
            top_k_num,
            num_experts,
        )
    raise ValueError(f"Unrecognized op_type {self}")

def bench_fn(self) -> Callable:
@@ -265,6 +368,16 @@ class OpType(Enum):
    return lora_shrink
if self == OpType.LORA_EXPAND:
    return lora_expand
if self in [
    OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
    OpType.FUSED_MOE_LORA_DOWN_SHRINK,
]:
    return fused_moe_lora_shrink
if self in [
    OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
    OpType.FUSED_MOE_LORA_DOWN_EXPAND,
]:
    return fused_moe_lora_expand

raise ValueError(f"Unrecognized optype {self}")

@@ -318,6 +431,8 @@ class BenchmarkContext:
sort_by_lora_id: bool
dtype: torch.dtype
seq_length: int | None = None
num_experts: int | None = None  # num_experts for MoE based ops
top_k_num: int | None = None  # top_k for MoE based ops
num_slices: int | None = None  # num_slices for slice based ops

def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
@@ -373,6 +488,11 @@ class BenchmarkTensors:
    f"{dtype_to_str(self.output.dtype)}"
)

def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType):
    return (
        size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size
    )

@staticmethod
def make(
    ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
@@ -385,6 +505,8 @@ class BenchmarkTensors:
    ctx.lora_rank,
    ctx.num_loras,
    ctx.num_slices,
    ctx.top_k_num,
    ctx.num_experts,
)
a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
input_tensor, lora_weights, output_tensor = make_rand_tensors(
@@ -432,17 +554,27 @@ class BenchmarkTensors:
    prompt_lora_indices_tensor,
)

def sanity_check(self) -> None:
def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None:
    """
    Fails asserts when non-conformality is detected.
    """
    num_tokens = self.input.shape[-2]
    num_tokens = (
        self.input.shape[1]
        if op_type.is_fused_moe_lora_expand_fn()
        else self.input.shape[-2]
    )
    # check metadata tensors
    assert torch.sum(self.seq_lens) == num_tokens
    ## In down shrink case, each token is repeated top_k_num times
    assert num_tokens == self.get_num_tokens(
        torch.sum(self.seq_lens), ctx.top_k_num, op_type
    ), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}"
    num_seqs = self.seq_lens.shape[0]
    # assert self.seq_start_loc.shape[0] == num_seqs
    ## In down shrink case, each prompt corresponds to top_k_num sequences
    assert self.prompt_lora_mapping.shape[0] == num_seqs
    assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
    assert self.get_num_tokens(
        self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
    )
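
The down-shrink special case is easiest to see with numbers; per `get_num_tokens`, each token is repeated `top_k_num` times. Illustrative values:

```python
# seq_lens = [4, 4]      -> torch.sum(seq_lens) == 8
# top_k_num = 2
# FUSED_MOE_LORA_DOWN_SHRINK: expected num_tokens = 8 * 2 = 16
# every other op type:        expected num_tokens = 8
```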

def to_device(self, device: str):
    """
@@ -471,21 +603,111 @@ class BenchmarkTensors:
    to_device(field) if field_name != "no_lora_flag_cpu" else field,
)

def metadata(self) -> tuple[int, int, int]:
def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]:
    """
    Return num_seqs, num_tokens and max_seq_len
    """
    num_seqs = self.seq_lens.shape[0]
    num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
    num_tokens = self.get_num_tokens(
        self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
    )
    max_seq_len = torch.max(self.seq_lens).item()
    num_slices = len(self.lora_weights_lst)
    return num_seqs, num_tokens, max_seq_len, num_slices

def as_lora_shrink_kwargs(self) -> dict[str, Any]:
    self.sanity_check()
def fused_moe_lora_data_prepare(
    self,
    block_size: int,
    token_lora_mapping: torch.Tensor,
    ctx: BenchmarkContext,
):
    def moe_lora_align_block_size(
        topk_ids: torch.Tensor,
        token_lora_mapping: torch.Tensor,
        block_size: int,
        num_experts: int,
        max_loras: int,
        expert_map: torch.Tensor | None = None,
        pad_sorted_ids: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Aligns tokens and experts into block-sized chunks for LoRA-based
        mixture-of-experts (MoE) execution.
        """
        max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
        if pad_sorted_ids:
            max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
        sorted_ids = torch.empty(
            (max_loras * max_num_tokens_padded,),
            dtype=torch.int32,
            device=topk_ids.device,
        )
        max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
        # Expert ids must default to -1 to prevent a blank block
        expert_ids = torch.empty(
            (max_loras * max_num_m_blocks,),
            dtype=torch.int32,
            device=topk_ids.device,
        )
        num_tokens_post_pad = torch.empty(
            (max_loras), dtype=torch.int32, device=topk_ids.device
        )

        ops.moe_lora_align_block_size(
            topk_ids,
            token_lora_mapping,
            num_experts,
            block_size,
            max_loras,
            max_num_tokens_padded,
            max_num_m_blocks,
            sorted_ids,
            expert_ids,
            num_tokens_post_pad,
        )
        if expert_map is not None:
            expert_ids = expert_map[expert_ids]

        return sorted_ids, expert_ids, num_tokens_post_pad
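
The buffer sizing above is worth checking with concrete numbers (illustrative, following the formulas in the helper):

```python
# 8 tokens, top_k=2, num_experts=8, BLOCK_SIZE_M=16, max_loras=4
# topk_ids.numel()      = 8 * 2 = 16
# max_num_tokens_padded = 16 + 8 * (16 - 1) = 136
# max_num_m_blocks      = ceil(136 / 16) = 9
# sorted_ids          : (4 * 136,) = (544,) int32
# expert_ids          : (4 * 9,)   = (36,)  int32
# num_tokens_post_pad : (4,)               int32
```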

    num_tokens = ctx.batch_size
    curr_topk_ids = torch.randint(
        0,
        ctx.num_experts,
        (num_tokens, ctx.top_k_num),
        device="cuda",
        dtype=torch.int32,
    )
    topk_weights = torch.randint(
        0,
        ctx.num_experts,
        (num_tokens, ctx.top_k_num),
        device="cuda",
        dtype=torch.int32,
    )

    (sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = (
        moe_lora_align_block_size(
            topk_ids=curr_topk_ids,
            token_lora_mapping=token_lora_mapping,
            block_size=block_size,
            num_experts=ctx.num_experts,
            max_loras=ctx.num_loras,
        )
    )

    sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1)
    expert_ids = expert_ids_lora.view(ctx.num_loras, -1)
    num_tokens_post_padded = num_tokens_post_padded_lora
    return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded)

def as_lora_shrink_kwargs(
    self, ctx: BenchmarkContext, op_type: OpType
) -> dict[str, Any]:
    self.sanity_check(ctx, op_type)
    self.to_device(self.input.device)

    _, num_tokens, _, num_slices = self.metadata()
    _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

    # Sanity check matrix shapes.
    i_shape, lw_shape, o_shape = (
@@ -520,11 +742,13 @@ class BenchmarkTensors:
    "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}

def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
    self.sanity_check()
def as_lora_expand_kwargs(
    self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool
) -> dict[str, Any]:
    self.sanity_check(ctx, op_type)
    self.to_device(self.input.device)

    _, num_tokens, _, num_slices = self.metadata()
    _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

    # Sanity check matrix shapes.
    i_shape, lw_shape, o_shape = (
@@ -561,18 +785,173 @@ class BenchmarkTensors:
    "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}

def bench_fn_kwargs(
    self, op_type: OpType, add_inputs: bool | None = None
def as_fused_moe_lora_shrink_kwargs(
    self, ctx: BenchmarkContext, op_type: OpType
) -> dict[str, Any]:
    if op_type.is_shrink_fn():
    self.sanity_check(ctx, op_type)
    self.to_device(self.input.device)

    _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

    # Sanity check matrix shapes.
    i_shape, lw_shape, o_shape = (
        self.input.shape,
        self.lora_weights_lst[0].shape,
        self.output.shape,
    )
    # Expected input shape : [num_tokens, hidden_size] for gate_up
    # Expected input shape : [top_k_num * num_tokens, hidden_size] for down
    assert len(i_shape) == 2
    assert i_shape[0] == num_tokens
    hidden_size = i_shape[1]
    # Expected lora weight shape [max_lora, num_experts, lora_rank, hidden_size]
    assert len(lw_shape) == 4
    assert lw_shape[-1] == hidden_size
    lora_rank = lw_shape[-2]
    # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank]
    assert len(o_shape) == 4
    assert (
        o_shape
        == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank)
        if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
        else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank)
    )
    kernel_config = get_lora_op_configs(
        op_type.name.lower(),
        max_loras=lw_shape[0],
        batch=num_tokens,
        hidden_size=hidden_size,
        rank=lora_rank,
        num_slices=num_slices,
        add_inputs=False,
    )

    (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
        self.fused_moe_lora_data_prepare(
            block_size=kernel_config["BLOCK_SIZE_M"],
            token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
            ctx=ctx,
        )
    )

    return {
        "qcurr_hidden_states": self.input,
        "lora_a_stacked": self.lora_weights_lst,
        "a_intermediate_cache1": self.output,
        "topk_weights": topk_weights,
        "sorted_token_ids": sorted_token_ids,
        "expert_ids": expert_ids,
        "num_tokens_post_padded": num_tokens_post_padded,
        "top_k_num": ctx.top_k_num,
        "device": self.input.device,
        "N": lora_rank,
        "M": topk_weights.shape[0],
        "EM": sorted_token_ids.shape[1],
        "K": self.input.shape[1],
        "num_tokens": num_tokens,
        "num_experts": ctx.num_experts,
        "num_slices": num_slices,
        "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"],
        "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"],
        "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"],
        "shrink_group_size_m": kernel_config["GROUP_SIZE_M"],
        "shrink_num_warps": kernel_config["NUM_WARPS"],
        "shrink_num_stages": kernel_config["NUM_STAGES"],
        "shrink_split_k": kernel_config.get("SPLIT_K", 1),
        "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
    }

def as_fused_moe_lora_expand_kwargs(
    self, ctx: BenchmarkContext, op_type: OpType
) -> dict[str, Any]:
    self.sanity_check(ctx, op_type)
    self.to_device(self.input.device)

    _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

    # Sanity check matrix shapes.
    i_shape, lw_shape, o_shape = (
        self.input.shape,
        self.lora_weights_lst[0].shape,
        self.output.shape,
    )

    # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank]
    assert len(i_shape) == 4
    assert i_shape[0] == num_slices
    assert i_shape[1] == num_tokens
    lora_rank = i_shape[-1]
    # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank]
    assert len(lw_shape) == 4
    assert lw_shape[-1] == lora_rank
    hidden_size = lw_shape[-2]
    # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices]
    assert len(o_shape) == 3
    assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices)

    kernel_config = get_lora_op_configs(
        op_type.name.lower(),
        max_loras=lw_shape[0],
        batch=num_tokens,
        hidden_size=hidden_size,
        rank=lora_rank,
        num_slices=num_slices,
        add_inputs=False,
    )

    (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
        self.fused_moe_lora_data_prepare(
            block_size=kernel_config["BLOCK_SIZE_M"],
            token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
            ctx=ctx,
        )
    )

    return {
        "a_intermediate_cache1": self.input,
        "lora_b_stacked": self.lora_weights_lst,
        "output": self.output,
        "topk_weights": topk_weights,
        "sorted_token_ids": sorted_token_ids,
        "expert_ids": expert_ids,
        "num_tokens_post_padded": num_tokens_post_padded,
        "top_k_num": ctx.top_k_num,
        "device": self.input.device,
        "N": lora_rank,
        "M": topk_weights.shape[0],
        "EM": sorted_token_ids.shape[1],
        "K": self.input.shape[1],
        "num_tokens": num_tokens,
        "num_experts": ctx.num_experts,
        "num_slices": num_slices,
        "max_lora_rank": lora_rank,
        "w1_output_dim_size": lw_shape[2],
        "expand_block_size_m": kernel_config["BLOCK_SIZE_M"],
        "expand_block_size_n": kernel_config["BLOCK_SIZE_N"],
        "expand_block_size_k": kernel_config["BLOCK_SIZE_K"],
        "expand_group_size_m": kernel_config["GROUP_SIZE_M"],
        "expand_num_warps": kernel_config["NUM_WARPS"],
        "expand_num_stages": kernel_config["NUM_STAGES"],
        "expand_split_k": kernel_config.get("SPLIT_K", 1),
        "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
    }

def bench_fn_kwargs(
    self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None
) -> dict[str, Any]:
    if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
        assert add_inputs is None
    else:
        assert add_inputs is not None

    if op_type == OpType.LORA_SHRINK:
        return self.as_lora_shrink_kwargs()
        return self.as_lora_shrink_kwargs(ctx, op_type)
    if op_type == OpType.LORA_EXPAND:
        return self.as_lora_expand_kwargs(add_inputs)
        return self.as_lora_expand_kwargs(ctx, op_type, add_inputs)
    if op_type.is_fused_moe_lora_shrink_fn():
        return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type)
    if op_type.is_fused_moe_lora_expand_fn():
        return self.as_fused_moe_lora_expand_kwargs(ctx, op_type)
    raise ValueError(f"Unrecognized optype {self}")

def test_correctness(
@@ -617,7 +996,7 @@ def bench_optype(
    test_correctness: bool = False,
) -> TMeasurement:
    assert arg_pool_size >= 1
    if op_type.is_shrink_fn():
    if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
        assert expand_fn_add_inputs is None
    else:
        assert expand_fn_add_inputs is not None
@@ -627,23 +1006,30 @@ def bench_optype(
    BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)
]
for bt in bench_tensors:
    bt.sanity_check()
    bt.sanity_check(ctx, op_type)

# Test correctness of our implementation.
if test_correctness:
    assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], (
        f"Correctness testing is not supported for {op_type.name}."
    )
    assert all(
        [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors]
        [
            bt.test_correctness(ctx, op_type, expand_fn_add_inputs)
            for bt in bench_tensors
        ]
    )

# BenchmarkTensors -> dict (kwargs)
kwargs_list = [
    bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
    bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs)
    for bt in bench_tensors
]

# Clear LoRA optimization hash-maps.
_LORA_A_PTR_DICT.clear()
_LORA_B_PTR_DICT.clear()
_LORA_PTR_DICT.clear()
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
    op_type.bench_fn()(**kwargs)
@@ -793,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):

# Benchmark bench_op
expand_fn_add_inputs = (
    [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
    [None]
    if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn()
    else args.expand_fn_add_inputs
)
for add_input_arg in expand_fn_add_inputs:
    seq_len_timers.append(
@@ -831,12 +1219,22 @@ def as_benchmark_contexts(
    hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace
) -> list[BenchmarkContext]:
    ctxs: list[BenchmarkContext] = []
    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
    for (
        batch_size,
        hidden_size,
        lora_rank,
        num_loras,
        sort_by_lora_id,
        top_k_num,
        num_experts,
    ) in product(  # noqa
        args.batch_sizes,
        list(hidden_sizes),
        lora_ranks,
        args.num_loras,
        args.sort_by_lora_id,
        args.top_k_nums,
        args.num_experts,
    ):
        ctxs.append(
            BenchmarkContext(
@@ -851,6 +1249,8 @@ def as_benchmark_contexts(
                seq_length=None,
                sort_by_lora_id=sort_by_lora_id,
                dtype=args.dtype,
                top_k_num=top_k_num,
                num_experts=num_experts,
                # To be filled based on the OpType to benchmark
                num_slices=None,
            )
@@ -1012,6 +1412,22 @@ if __name__ == "__main__":
    ),
)

p.add_argument(
    "--top-k-nums",
    nargs="+",
    type=int,
    default=DEFAULT_TOP_K_NUMS,
    help="Top-K values for MoE LoRA operations",
)

p.add_argument(
    "--num-experts",
    nargs="+",
    type=int,
    default=DEFAULT_NUM_EXPERTS,
    help="Number of experts for MoE LoRA operations",
)

parser = FlexibleArgumentParser(
    description=f"""
Benchmark LoRA kernels:

@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q,
b_bias=None,
b_scales=w_s,
a_scales=None,
global_scale=None,
b_zeros=w_zp,
g_idx=g_idx,

@@ -263,7 +263,7 @@ def bench_run(

results.append(
    benchmark.Timer(
        stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
        stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
        globals=globals,
        label=label,
        sub_label=sub_label,
@@ -273,7 +273,7 @@ def bench_run(

results.append(
    benchmark.Timer(
        stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
        stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
        globals=globals,
        label=label,
        sub_label=sub_label,

@@ -185,8 +185,8 @@ def benchmark_config(
graph.replay()
torch.cuda.synchronize()

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)

latencies: list[float] = []
for i in range(num_iters):
@@ -211,7 +211,7 @@ def get_rocm_tuning_space(use_fp16):
num_warps_range = [1, 2, 4, 8]
group_m_range = [1, 4, 8, 16, 32]
num_stage_range = [2]
waves_per_eu_range = [0]
waves_per_eu_range = [0, 1, 2, 4]
matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
kpack_range = [1, 2] if use_fp16 else []

@@ -590,6 +590,7 @@ def main(args: argparse.Namespace):
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
"Glm4MoeForCausalLM",
"NemotronHForCausalLM",
):
E = config.n_routed_experts
topk = config.num_experts_per_tok
@@ -615,6 +616,11 @@ def main(args: argparse.Namespace):
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
    E = config.thinker_config.text_config.num_experts
    topk = config.thinker_config.text_config.num_experts_per_tok
    intermediate_size = config.thinker_config.text_config.moe_intermediate_size
    hidden_size = config.thinker_config.text_config.hidden_size
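
The new branch reads the MoE geometry from the nested thinker/text config. A sketch of that traversal (the checkpoint id is hypothetical; the attribute names follow the branch above):

```python
from transformers import AutoConfig

# hypothetical model id; any Qwen3-Omni MoE checkpoint has this config layout
config = AutoConfig.from_pretrained("Qwen/Qwen3-Omni-MoE-Instruct", trust_remote_code=True)
text_config = config.thinker_config.text_config
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
```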
else:
    # Support for llama4
    config = config.get_text_config()

@@ -105,8 +105,8 @@ def benchmark_permute(
graph.replay()
torch.cuda.synchronize()

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)

latencies: list[float] = []
for i in range(num_iters):
@@ -241,8 +241,8 @@ def benchmark_unpermute(
graph.replay()
torch.cuda.synchronize()

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)

latencies: list[float] = []
for i in range(num_iters):

@@ -6,7 +6,7 @@
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
@@ -86,9 +86,8 @@ def benchmark_mrope(
num_heads: int,
num_kv_heads: int,
max_position: int = 8192,
rope_theta: float = 10000,
is_neox_style: bool = True,
rope_scaling: dict[str, Any] = None,
rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
@@ -102,9 +101,8 @@ def benchmark_mrope(
head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dtype=dtype,
).to(device=device)

@@ -203,9 +201,8 @@ def benchmark_mrope(
num_kv_heads,
head_dim,
max_position,
rope_theta,
is_neox_style,
str(rope_scaling),
str(rope_parameters),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
@@ -255,9 +252,8 @@ if __name__ == "__main__":
"num_kv_heads",
"head_dim",
"max_position",
"rope_theta",
"is_neox_style",
"rope_scaling",
"rope_parameters",
"dtype",
"torch_mean",
"torch_median",
@@ -303,7 +299,7 @@ if __name__ == "__main__":
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
rope_theta = config.rope_theta
rope_parameters = config.rope_parameters
max_position = config.max_position_embeddings

for num_tokens in num_tokens_list:
@@ -315,9 +311,8 @@ if __name__ == "__main__":
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
rope_theta=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
rope_parameters=rope_parameters,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,

@@ -30,8 +30,8 @@ def _time_cuda(
fn()
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)

start.record()
for _ in range(bench_iters):

@@ -1,97 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from itertools import accumulate
import itertools

import nvtx
import torch

from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
from vllm.platforms import current_platform
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser

batch_size_range = [2**i for i in range(0, 8, 2)]
seq_len_range = [2**i for i in range(6, 10, 1)]
num_heads_range = [32, 48]
configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range))

def benchmark_rope_kernels_multi_lora(
    is_neox_style: bool,
    batch_size: int,
    seq_len: int,
    num_heads: int,
    head_size: int,
    rotary_dim: int | None,
    dtype: torch.dtype,
    seed: int,
    device: str,
    max_position: int = 8192,
    base: float = 10000,
) -> None:
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size
    # simulating serving 4 LoRAs
    scaling_factors = [1, 2, 4, 8]
    # batched RoPE can take multiple scaling factors
    batched_rope = get_rope(
        head_size,
        rotary_dim,
        max_position,
        base,
        is_neox_style,
        {"rope_type": "linear", "factor": tuple(scaling_factors)},
    )
    # non-batched RoPE takes only one scaling factor; we create multiple
    # instances to simulate the same behavior
    non_batched_ropes: list[RotaryEmbedding] = []
    for scaling_factor in scaling_factors:
        non_batched_ropes.append(
            get_rope(
                head_size,
                rotary_dim,
                max_position,
                base,
                is_neox_style,
                {"rope_type": "linear", "factor": (scaling_factor,)},
            )
        )

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
    key = torch.randn_like(query)

    # create query offsets for batched RoPE; we concat multiple kv caches
    # together and each query needs to find the right kv cache of its type
    offset_map = torch.tensor(
        list(
            accumulate(
                [0]
                + [
                    max_position * scaling_factor * 2
                    for scaling_factor in scaling_factors[:-1]
                ]
            )
def get_benchmark(head_size, rotary_dim, is_neox_style, device):
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["batch_size", "seq_len", "num_heads"],
            x_vals=[list(_) for _ in configs],
            line_arg="provider",
            line_vals=["torch", "flashinfer", "vllm"],
            line_names=["PyTorch", "FlashInfer", "vLLM"],
            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
            ylabel="us",
            plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}",
            args={},
        )
    )
    query_types = torch.randint(
        0, len(scaling_factors), (batch_size, seq_len), device=device
    )
    # map query types to offsets
    query_offsets = offset_map[query_types]
    # the kernel takes flattened offsets
    flatten_offsets = query_offsets.flatten()
    def benchmark(batch_size, seq_len, num_heads, provider):
        dtype = torch.bfloat16
        max_position = 8192
        base = 10000
        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
        rope = rope.to(dtype=dtype, device=device)
        cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)

    # batch queries of the same type together for non-batched RoPE
    queries = [query[query_types == i] for i in range(len(scaling_factors))]
    keys = [key[query_types == i] for i in range(len(scaling_factors))]
    packed_qkr = zip(queries, keys, non_batched_ropes)
    # synchronize before start timing
    torch.cuda.synchronize()
    with nvtx.annotate("non-batched", color="yellow"):
        for q, k, r in packed_qkr:
            r.forward(positions, q, k)
    torch.cuda.synchronize()
    with nvtx.annotate("batched", color="green"):
        batched_rope.forward(positions, query, key, flatten_offsets)
    torch.cuda.synchronize()
        positions = torch.randint(0, max_position, (batch_size, seq_len), device=device)
        query = torch.randn(
            (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device
        )
        key = torch.randn_like(query)

        quantiles = [0.5, 0.2, 0.8]

        if provider == "torch":
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: rope.forward_native(positions, query.clone(), key.clone()),
                quantiles=quantiles,
            )
        elif provider == "flashinfer":
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: torch.ops.vllm.flashinfer_rotary_embedding(
                    positions,
                    query.clone(),
                    key.clone(),
                    head_size,
                    cos_sin_cache,
                    is_neox_style,
                ),
                quantiles=quantiles,
            )
        else:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: rope.forward_cuda(positions, query.clone(), key.clone()),
                quantiles=quantiles,
            )

        return 1000 * ms, 1000 * max_ms, 1000 * min_ms

    return benchmark
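
Usage mirrors the `__main__` block below; a minimal invocation looks like this (the argument values are illustrative):

```python
benchmark = get_benchmark(
    head_size=128, rotary_dim=128, is_neox_style=True, device="cuda:0"
)
benchmark.run(print_data=True, save_path="./configs/rope/")
```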


if __name__ == "__main__":
@@ -116,17 +95,12 @@ if __name__ == "__main__":
parser.add_argument(
    "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
)
parser.add_argument("--save-path", type=str, default="./configs/rope/")
args = parser.parse_args()
print(args)

benchmark_rope_kernels_multi_lora(
    is_neox_style=args.is_neox_style,
    batch_size=args.batch_size,
    seq_len=args.seq_len,
    num_heads=args.num_heads,
    head_size=args.head_size,
    rotary_dim=args.rotary_dim,
    dtype=getattr(torch, args.dtype),
    seed=args.seed,
    device=args.device,
# Get the benchmark function
benchmark = get_benchmark(
    args.head_size, args.rotary_dim, args.is_neox_style, args.device
)
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

@@ -78,11 +78,11 @@ WEIGHT_SHAPES = {
}

WEIGHT_SHAPES_MOE = {
    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
        [8, 2, 4096, 28672],
        [8, 2, 14336, 4096],
    ],
    "nm-testing/deepseekv2-lite": [
    "deepseek-ai/DeepSeek-V2-Lite": [
        [64, 6, 2048, 1408],
    ],
    "ibm-granite/granite-3.0-1b-a400m": [

@@ -253,8 +253,8 @@ def benchmark(
)
torch.cuda.synchronize()

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)

# Benchmark
latencies: list[float] = []

@@ -127,8 +127,8 @@ def benchmark_decode(

def time_fn(fn, warmup=10, trials=20):
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
    times = []
    for i in range(warmup):
        fn()

@@ -139,8 +139,8 @@ def benchmark_prefill(

def time_fn(fn, warmup=10, trials=20):
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
    times = []
    for i in range(warmup):
        fn()

@@ -183,8 +183,8 @@ def benchmark_config(
run()
torch.cuda.synchronize()

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)

latencies: list[float] = []
for i in range(num_iters):

@@ -2,7 +2,7 @@

This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.

Currently this just includes dense GEMMs and only works on Hopper GPUs.
Currently, this just includes dense GEMMs and only works on Hopper GPUs.

## Setup


@@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```

If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
benchmark-only runtime so the reported throughput stays comparable).

### JSON configuration file for synthetic conversations generation

The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>

@@ -11,6 +11,7 @@ from bench_utils import (
    Color,
    logger,
)
from tqdm import tqdm
from transformers import AutoTokenizer  # type: ignore

# Conversation ID is a string (e.g., "UzTK34D")
@@ -417,6 +418,10 @@ def generate_conversations(
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
logger.info(
    f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
    f"total tokens so far: {len(list_of_tokens)}"
)

conversations: ConversationsMap = {}
conv_id = 0
@@ -449,18 +454,25 @@ def generate_conversations(
)
base_offset += common_prefix_tokens

for conv_id in range(args.num_conversations):
for conv_id in tqdm(
    range(args.num_conversations),
    total=args.num_conversations,
    desc="Generating conversations",
    unit="conv",
):
    # Generate a single conversation
    messages: MessagesList = []

    nturns = turn_count[conv_id]

    # User prompt token count per turn (with lower limit)
    input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
    input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
    input_token_count = np.maximum(input_token_count, base_prompt_token_count)

    # Assistant answer token count per turn (with lower limit)
    output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
    output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
        int
    )
    output_token_count = np.maximum(output_token_count, 1)

    user_turn = True

@@ -55,6 +55,7 @@ class ClientArgs(NamedTuple):
verify_output: bool
conversation_sampling: ConversationSampling
request_rate: float
max_retries: int


class RequestArgs(NamedTuple):
@@ -63,6 +64,7 @@ class RequestArgs(NamedTuple):
stream: bool
limit_min_tokens: int  # Use negative value for no limit
limit_max_tokens: int  # Use negative value for no limit
timeout_sec: int


class BenchmarkArgs(NamedTuple):
@@ -214,6 +216,7 @@ async def send_request(
stream: bool = True,
min_tokens: int | None = None,
max_tokens: int | None = None,
timeout_sec: int = 120,
) -> ServerResponse:
payload = {
    "model": model,
@@ -235,10 +238,16 @@ async def send_request(
headers = {"Content-Type": "application/json"}

# Calculate the timeout for the request
timeout_sec = 120
if max_tokens is not None:
    # Assume TPOT of 200ms and use max_tokens to determine timeout
    timeout_sec = max(timeout_sec, int(max_tokens * 0.2))
    token_based_timeout = int(max_tokens * 0.2)
    if token_based_timeout > timeout_sec:
        timeout_sec = token_based_timeout
        logger.info(
            "Using timeout of %ds based on max_tokens %d",
            timeout_sec,
            max_tokens,
        )
timeout = aiohttp.ClientTimeout(total=timeout_sec)
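
With the default `timeout_sec` of 120 and the assumed 200 ms per output token, the arithmetic works out as follows (values illustrative):

```python
# timeout_sec = 120, max_tokens = 1024
# token_based_timeout = int(1024 * 0.2) = 204  -> timeout_sec becomes 204
# timeout_sec = 120, max_tokens = 256
# token_based_timeout = int(256 * 0.2) = 51    -> timeout_sec stays 120
```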

valid_response = True
@@ -409,6 +418,7 @@ async def send_turn(
req_args.stream,
min_tokens,
max_tokens,
req_args.timeout_sec,
)

if response.valid is False:
@@ -518,6 +528,25 @@ async def poisson_sleep(request_rate: float, verbose: bool = False) -> None:
await asyncio.sleep(interval)


async def exponential_backoff_sleep(
    attempt_cnt: int,
    base_rate: float = 1.0,
    backoff_factor: float = 2.0,
    jitter_fraction: float = 0.10,
    verbose: bool = False,
) -> None:
    # Sleep with exponential backoff and jitter after a failed request.
    backoff_delay = base_rate * (backoff_factor**attempt_cnt)
    jittered_delay = backoff_delay * (
        1 + np.random.uniform(-jitter_fraction, jitter_fraction)
    )

    if verbose:
        logger.info(f"Backoff for {jittered_delay:.3f} seconds...")

    await asyncio.sleep(jittered_delay)
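
Before jitter, the delay doubles on each attempt; with the defaults above:

```python
# base_rate=1.0, backoff_factor=2.0, jitter_fraction=0.10
# attempt 0 -> 1.0 s  (0.9 .. 1.1 s after jitter)
# attempt 1 -> 2.0 s  (1.8 .. 2.2 s)
# attempt 2 -> 4.0 s  (3.6 .. 4.4 s)
# attempt 3 -> 8.0 s  (7.2 .. 8.8 s)
```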
|
||||
|
||||
|
||||
async def client_main(
|
||||
args: ClientArgs,
|
||||
req_args: RequestArgs,
|
||||
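
The schedule doubles the delay per attempt and perturbs it by up to ±10% so retrying clients do not resynchronize. A standalone sketch of the expected delays:

    import numpy as np

    def backoff_delay(attempt_cnt: int, base_rate: float = 1.0,
                      backoff_factor: float = 2.0,
                      jitter_fraction: float = 0.10) -> float:
        delay = base_rate * (backoff_factor ** attempt_cnt)
        return delay * (1 + np.random.uniform(-jitter_fraction, jitter_fraction))

    # Attempts 0, 1, 2 sleep around 1 s, 2 s, 4 s respectively.
    for attempt in range(3):
        d = backoff_delay(attempt)
        assert abs(d - 2.0 ** attempt) <= 0.1 * 2.0 ** attempt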
@@ -532,8 +561,11 @@ async def client_main(
         f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}"  # noqa: E501
     )

-    random.seed(args.seed)
-    np.random.seed(args.seed)
+    # Set unique seed per client (each client runs in its own process)
+    # Add 1 to ensure no client uses the same seed as the main process
+    client_seed = args.seed + client_id + 1
+    random.seed(client_seed)
+    np.random.seed(client_seed)

     # Active conversations
     active_convs: ConversationsMap = {}
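
Deriving each worker's seed from the base seed keeps runs reproducible without letting the per-process RNG streams collide. A small sketch of the derivation, assuming a base seed of 42 and three clients:

    base_seed = 42
    num_clients = 3

    # Client i gets base_seed + i + 1; the +1 keeps every client distinct
    # from the main process, which keeps the base seed itself.
    seeds = {"main": base_seed}
    seeds.update({f"client_{i}": base_seed + i + 1 for i in range(num_clients)})
    assert len(set(seeds.values())) == len(seeds)  # all distinct
    print(seeds)  # {'main': 42, 'client_0': 43, 'client_1': 44, 'client_2': 45}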
@@ -646,49 +678,62 @@ async def client_main(
             )
             time_of_last_turn[conv_id] = curr_time_sec

-            success = True
-            try:
-                result = await send_turn(
-                    session,
-                    client_id,
-                    conv_id,
-                    messages,
-                    current_turn,
-                    tokenizer,
-                    req_args,
-                    args.print_content,
-                    args.verify_output,
-                )
-                if result is not None:
-                    result_queue.put(result)
-                else:
-                    # None means that the request failed,
-                    # and should not be added to the statistics.
-                    success = False
-                    num_failures += 1
-
-                    logger.warning(
-                        f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
-                    )
-
-                    # Remove the conversation (should not be used again)
-                    active_convs.pop(conv_id)
-
-            except asyncio.exceptions.TimeoutError:
-                num_failures += 1
-                logger.exception(
-                    f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
-                )
-                break  # Exit gracefully instead of raising an error
-
-            except Exception:
-                num_failures += 1
-                logger.exception(
-                    f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
-                )
-                break  # Exit gracefully instead of raising an error
-
-            if success:
+            success = False
+            for attempt_cnt in range(args.max_retries + 1):
+                try:
+                    exception = False
+                    result = await send_turn(
+                        session,
+                        client_id,
+                        conv_id,
+                        messages,
+                        current_turn,
+                        tokenizer,
+                        req_args,
+                        args.print_content,
+                        args.verify_output,
+                    )
+                    if result is not None:
+                        result_queue.put(result)
+                        success = True
+                        break
+                    else:
+                        logger.warning(
+                            f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
+                        )
+                except asyncio.exceptions.TimeoutError:
+                    exception = True
+                    logger.error(
+                        "%sClient %d - Timeout during conversation ID %s (turn: %d). "
+                        "Base timeout is %ss (set with --request-timeout-sec), but the "
+                        "effective timeout may be longer based on max_tokens. If this "
+                        "is unexpected, consider increasing the timeout or checking "
+                        "model performance.%s",
+                        Color.RED,
+                        client_id,
+                        conv_id,
+                        current_turn,
+                        req_args.timeout_sec,
+                        Color.RESET,
+                    )
+                except Exception:
+                    exception = True
+                    logger.exception(
+                        f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}"  # noqa: E501
+                    )

+                # Sleep before retry if not last attempt
+                if not success and attempt_cnt < args.max_retries:
+                    await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose)
+
+            if not success:
+                num_failures += 1
+                # Remove the conversation (should not be used again)
+                active_convs.pop(conv_id)
+                if exception:
+                    break  # Exit gracefully instead of raising an error
+            else:
                 num_successes += 1

             # Update the turns counter to include the LLM response
@@ -803,6 +848,7 @@ def get_client_config(
         verify_output=args.verify_output,
         conversation_sampling=args.conversation_sampling,
         request_rate=args.request_rate,
+        max_retries=args.max_retries,
     )

     if args.limit_min_tokens > 0 or args.limit_max_tokens > 0:
@@ -815,6 +861,9 @@ def get_client_config(
|
||||
"Invalid min/max tokens limits (min should not be larger than max)"
|
||||
)
|
||||
|
||||
if args.request_timeout_sec <= 0:
|
||||
raise ValueError("Request timeout must be a positive number")
|
||||
|
||||
# Arguments for API requests
|
||||
chat_url = f"{args.url}/v1/chat/completions"
|
||||
model_name = args.served_model_name if args.served_model_name else args.model
|
||||
@@ -825,6 +874,7 @@ def get_client_config(
         stream=not args.no_stream,
         limit_min_tokens=args.limit_min_tokens,
         limit_max_tokens=args.limit_max_tokens,
+        timeout_sec=args.request_timeout_sec,
     )

     return client_args, req_args
@@ -968,7 +1018,7 @@ async def main_mp(
                 f"(is alive: {client.is_alive()}){Color.RESET}"
             )

-            client.join(timeout=120)
+            client.join(timeout=req_args.timeout_sec + 1)

            if client.is_alive():
                logger.warning(
@@ -1026,6 +1076,7 @@ def process_statistics(
     verbose: bool,
     gen_conv_args: GenConvArgs | None = None,
     excel_output: bool = False,
+    warmup_runtime_sec: float | None = None,
 ) -> None:
     if len(client_metrics) == 0:
         logger.info("No samples to process")
@@ -1119,8 +1170,13 @@ def process_statistics(
     # Convert milliseconds to seconds
     runtime_sec = runtime_sec / 1000.0
     requests_per_sec = float(len(df)) / runtime_sec

-    params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
+    params = {
+        "runtime_sec": runtime_sec,
+        "requests_per_sec": requests_per_sec,
+    }
+    if warmup_runtime_sec is not None:
+        params["warmup_runtime_sec"] = warmup_runtime_sec
+        params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec

     # Generate a summary of relevant metrics (and drop irrelevant data)
     df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1334,6 +1390,16 @@ async def main() -> None:
         help="Expected request rate (Poisson process) per client in requests/sec."
         "Set to 0 for no delay between requests.",
     )
+    parser.add_argument(
+        "--max-retries",
+        type=int,
+        default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")),
+        help="Maximum number of retry attempts for timed-out requests. "
+        "Default is 0 (no retries). "
+        "Set to higher values to retry failed requests and maintain "
+        "fair workload distribution. "
+        "Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.",
+    )
     parser.add_argument(
         "--conversation-sampling",
         type=ConversationSampling,
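
Because the argparse default reads the environment, retries can be enabled without editing the command line; an explicit --max-retries still wins, since the env var only feeds the default. A sketch of that default resolution:

    import os

    os.environ["MULTITURN_BENCH_MAX_RETRIES"] = "3"
    default_retries = int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0"))
    assert default_retries == 3  # used only when --max-retries is not passed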
@@ -1351,6 +1417,13 @@ async def main() -> None:
         action="store_true",
         help="Verify the LLM output (compare to the answers in the input JSON file)",
     )
+    parser.add_argument(
+        "--request-timeout-sec",
+        type=int,
+        default=120,
+        help="Timeout in seconds for each API request (default: 120). "
+        "Automatically increased if max tokens imply longer decoding.",
+    )

     parser.add_argument(
         "--no-stream",
@@ -1426,11 +1499,10 @@ async def main() -> None:
|
||||
f"Invalid --warmup-percentage={args.warmup_percentage}"
|
||||
) from None
|
||||
|
||||
# Set global seeds for main process
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
|
||||
if not os.path.exists(args.model):
|
||||
raise OSError(f"Path does not exist: {args.model}")
|
||||
logger.info("Loading tokenizer")
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model)
|
||||
|
||||
@@ -1486,6 +1558,8 @@ async def main() -> None:
         url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
     )

+    warmup_runtime_sec: float | None = None
+
     # Warm-up step
     if args.warmup_step:
         # Only send a single user prompt from every conversation.
@@ -1500,26 +1574,56 @@ async def main() -> None:
         # all clients should finish their work before exiting
         warmup_bench_args = bench_args._replace(early_stop=False)

-        logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
+        logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
+        warmup_start_ns = time.perf_counter_ns()
         conversations, _ = await main_mp(
             warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
         )
-        logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
+        warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
+        logger.info(
+            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+            Color.PURPLE,
+            warmup_runtime_sec,
+            warmup_runtime_sec * 1000,
+            Color.RESET,
+        )
+        logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)

     # Run the benchmark
-    start_time = time.perf_counter_ns()
+    benchmark_start_ns = time.perf_counter_ns()
     client_convs, client_metrics = await main_mp(
         client_args, req_args, bench_args, tokenizer, conversations
     )
-    total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
+    benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)

     # Calculate requests per second
-    total_runtime_sec = total_runtime_ms / 1000.0
-    rps = len(client_metrics) / total_runtime_sec
+    requests_per_sec = len(client_metrics) / benchmark_runtime_sec
+    benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
     logger.info(
-        f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
-        f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
+        "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
+        "requests per second: %.3f%s",
+        Color.GREEN,
+        benchmark_runtime_sec,
+        benchmark_runtime_ms,
+        requests_per_sec,
+        Color.RESET,
     )
+    if warmup_runtime_sec is not None:
+        total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
+        logger.info(
+            "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+            Color.GREEN,
+            warmup_runtime_sec,
+            warmup_runtime_sec * 1000,
+            Color.RESET,
+        )
+        logger.info(
+            "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
+            Color.GREEN,
+            total_runtime_sec,
+            total_runtime_sec * 1000,
+            Color.RESET,
+        )

     # Benchmark parameters
     params = {
@@ -1544,6 +1648,7 @@ async def main() -> None:
         verbose=args.verbose,
         gen_conv_args=gen_conv_args,
         excel_output=args.excel_output,
+        warmup_runtime_sec=warmup_runtime_sec,
     )

     if args.output_file is not None:
@@ -2,4 +2,5 @@ numpy>=1.24
 pandas>=2.0.0
 aiohttp>=3.10
 transformers>=4.46
-xlsxwriter>=3.2.1
+xlsxwriter>=3.2.1
+tqdm>=4.66
@@ -15,6 +15,7 @@ endif()
 #
 set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
 set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
+set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})

 include_directories("${CMAKE_SOURCE_DIR}/csrc")
@@ -140,6 +141,22 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
         set(ENABLE_AVX512VNNI OFF)
         message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
     endif()

+    find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND)
+    if (AMXBF16_FOUND OR ENABLE_AMXBF16)
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+            list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile")
+            set(ENABLE_AMXBF16 ON)
+            add_compile_definitions(-DCPU_CAPABILITY_AMXBF16)
+        else()
+            set(ENABLE_AMXBF16 OFF)
+            message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3")
+        endif()
+    else()
+        set(ENABLE_AMXBF16 OFF)
+        message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.")
+    endif()
+
 elseif (AVX2_FOUND)
     list(APPEND CXX_COMPILE_FLAGS "-mavx2")
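
At configure time, find_isa scans the host CPU's feature flags for amx_bf16. A rough Python equivalent for Linux hosts, offered only as a sketch (it reads /proc/cpuinfo, so it is not portable):

    def host_supports_amx_bf16(cpuinfo_path: str = "/proc/cpuinfo") -> bool:
        # Mirror find_isa: look for the amx_bf16 feature bit in the flags line.
        with open(cpuinfo_path) as f:
            for line in f:
                if line.startswith("flags"):
                    return "amx_bf16" in line.split()
        return False

    print("AMX BF16 available:", host_supports_amx_bf16())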
@@ -193,7 +210,30 @@ endif()
|
||||
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
|
||||
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
|
||||
set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
|
||||
if(ASIMD_FOUND)
|
||||
# Set number of parallel build processes
|
||||
include(ProcessorCount)
|
||||
ProcessorCount(NPROC)
|
||||
if(NOT NPROC)
|
||||
set(NPROC 4)
|
||||
endif()
|
||||
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
|
||||
# and create a local shim dir with it
|
||||
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
|
||||
|
||||
find_library(OPEN_MP
|
||||
NAMES gomp
|
||||
PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
|
||||
NO_DEFAULT_PATH
|
||||
REQUIRED
|
||||
)
|
||||
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
|
||||
if (OPEN_MP)
|
||||
set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
|
||||
endif()
|
||||
|
||||
# Fetch and populate ACL
|
||||
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
|
||||
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
|
||||
else()
|
||||
@@ -202,43 +242,58 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
                 SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
                 SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
                 GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
-                GIT_TAG v52.2.0
+                GIT_TAG v52.6.0
                 GIT_SHALLOW TRUE
                 GIT_PROGRESS TRUE
             )
             set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
-            set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")
         endif()

-        # Build ACL with scons
-        include(ProcessorCount)
-        ProcessorCount(_NPROC)
-        set(_scons_cmd
-            scons -j${_NPROC}
-            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
-            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
-            multi_isa=1 openmp=1 cppthreads=0
+        # Build ACL with CMake
+        set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
+        set(CMAKE_BUILD_TYPE "Release")
+        set(ARM_COMPUTE_ARCH "armv8.2-a")
+        set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
+        set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
+        set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+        set(ARM_COMPUTE_ENABLE_OPENMP "ON")
+        set(ARM_COMPUTE_ENABLE_WERROR "OFF")
+        set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
+        set(ARM_COMPUTE_BUILD_TESTING "OFF")
+
+        set(_cmake_config_cmd
+            ${CMAKE_COMMAND} -G Ninja -B build
+            -DARM_COMPUTE_BUILD_SHARED_LIB=OFF
+            -DCMAKE_BUILD_TYPE=Release
+            -DARM_COMPUTE_ARCH=armv8.2-a
+            -DARM_COMPUTE_ENABLE_ASSERTS=OFF
+            -DARM_COMPUTE_ENABLE_CPPTHREADS=OFF
+            -DARM_COMPUTE_ENABLE_OPENMP=ON
+            -DARM_COMPUTE_ENABLE_WERROR=OFF
+            -DARM_COMPUTE_BUILD_EXAMPLES=OFF
+            -DARM_COMPUTE_BUILD_TESTING=OFF)
+        set(_cmake_build_cmd
+            ${CMAKE_COMMAND} --build build -- -j${NPROC}
         )

-        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
-        # and create a local shim dir with it
-        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
-        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
-
-        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
-            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
-        endif()
-
         execute_process(
-            COMMAND ${_scons_cmd}
+            COMMAND ${_cmake_config_cmd}
             WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
+        )
+        execute_process(
+            COMMAND ${_cmake_build_cmd}
+            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
             RESULT_VARIABLE _acl_rc
         )

         if(NOT _acl_rc EQUAL 0)
             message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
         endif()
         message(STATUS "Arm Compute Library (ACL) built successfully.")

-        set(ONEDNN_AARCH64_USE_ACL "ON")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+        # VLLM/oneDNN settings for ACL
+        set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
+        add_compile_definitions(VLLM_USE_ACL)
     endif()
@@ -255,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     FetchContent_Declare(
         oneDNN
         GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
+        GIT_TAG v3.10
         GIT_PROGRESS TRUE
         GIT_SHALLOW TRUE
     )
@@ -275,7 +330,10 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     set(ONEDNN_VERBOSE "OFF")
     set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

+    set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+    set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
     FetchContent_MakeAvailable(oneDNN)
+    set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE})
     add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
     target_include_directories(
         dnnl_ext
@@ -305,18 +363,19 @@ endif()
 #
 set(VLLM_EXT_SRC
     "csrc/cpu/activation.cpp"
-    "csrc/cpu/attention.cpp"
     "csrc/cpu/cache.cpp"
     "csrc/cpu/utils.cpp"
     "csrc/cpu/layernorm.cpp"
-    "csrc/cpu/mla_decode.cpp"
     "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp"
-    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
+    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
+    "csrc/cpu/cpu_attn.cpp"
+    "csrc/cpu/scratchpad_manager.cpp"
+    "csrc/cpu/torch_bindings.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     set(VLLM_EXT_SRC
         "csrc/cpu/shm.cpp"
+        "csrc/cpu/cpu_wna16.cpp"
         ${VLLM_EXT_SRC})
     if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
         set(VLLM_EXT_SRC
@@ -343,7 +402,7 @@ message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
 # Define extension targets
 #

-define_gpu_extension_target(
+define_extension_target(
     _C
     DESTINATION vllm
     LANGUAGE CXX
@@ -354,4 +413,4 @@ define_extension_target(
     WITH_SOABI
 )

-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
@@ -92,7 +92,7 @@ if(FLASH_MLA_ARCHS)
         SRCS "${FlashMLA_Extension_SOURCES}"
         CUDA_ARCHS "${FLASH_MLA_ARCHS}")

-    define_gpu_extension_target(
+    define_extension_target(
         _flashmla_C
         DESTINATION vllm
         LANGUAGE ${VLLM_GPU_LANG}
@@ -109,7 +109,7 @@ if(FLASH_MLA_ARCHS)
         $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
         $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)

-    define_gpu_extension_target(
+    define_extension_target(
         _flashmla_extension_C
         DESTINATION vllm
         LANGUAGE ${VLLM_GPU_LANG}
cmake/external_projects/triton_kernels.cmake (new file, 53 lines)
@@ -0,0 +1,53 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels

set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")

# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
    message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
    FetchContent_Declare(
        triton_kernels
        SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
    )

else()
    set(TRITON_GIT "https://github.com/triton-lang/triton.git")
    message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
    FetchContent_Declare(
        triton_kernels
        # TODO (varun) : Fetch just the triton_kernels directory from Triton
        GIT_REPOSITORY https://github.com/triton-lang/triton.git
        GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
        GIT_PROGRESS TRUE
        SOURCE_SUBDIR python/triton_kernels/triton_kernels
    )
endif()

# Fetch content
FetchContent_MakeAvailable(triton_kernels)

if (NOT triton_kernels_SOURCE_DIR)
    message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
endif()

if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
    set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
else()
    set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
endif()

message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")

add_custom_target(triton_kernels)

# Ensure the vllm/third_party directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")

## Copy .py files to install directory.
install(DIRECTORY
    ${TRITON_KERNELS_PYTHON_DIR}
    DESTINATION
        vllm/third_party/triton_kernels/
    COMPONENT triton_kernels
    FILES_MATCHING PATTERN "*.py")
@@ -38,7 +38,7 @@ else()
     FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
+        GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -453,21 +453,20 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
 endmacro()

 #
-# Define a target named `GPU_MOD_NAME` for a single extension. The
+# Define a target named `MOD_NAME` for a single extension. The
 # arguments are:
 #
 # DESTINATION <dest> - Module destination directory.
-# LANGUAGE <lang>    - The GPU language for this module, e.g CUDA, HIP,
-#                      etc.
+# LANGUAGE <lang>    - The language for this module, e.g. CUDA, HIP,
+#                      CXX, etc.
 # SOURCES <sources>  - List of source files relative to CMakeLists.txt
 #                      directory.
 #
 # Optional arguments:
 #
-# ARCHITECTURES <arches> - A list of target GPU architectures in cmake
-#                          format.
-#                          Refer `CMAKE_CUDA_ARCHITECTURES` documentation
-#                          and `CMAKE_HIP_ARCHITECTURES` for more info.
+# ARCHITECTURES <arches> - A list of target architectures in cmake format.
+#                          For GPU, refer to CMAKE_CUDA_ARCHITECTURES and
+#                          CMAKE_HIP_ARCHITECTURES for more info.
 #                          ARCHITECTURES will use cmake's defaults if
 #                          not provided.
 # COMPILE_FLAGS <flags> - Extra compiler flags passed to NVCC/hip.
@@ -478,63 +477,67 @@ endmacro()
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
-function (define_gpu_extension_target GPU_MOD_NAME)
+function (define_extension_target MOD_NAME)
   cmake_parse_arguments(PARSE_ARGV 1
-    GPU
+    ARG
     "WITH_SOABI"
     "DESTINATION;LANGUAGE;USE_SABI"
     "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

   # Add hipify preprocessing step when building with HIP/ROCm.
-  if (GPU_LANGUAGE STREQUAL "HIP")
-    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  if (ARG_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(ARG_SOURCES ${MOD_NAME} "${ARG_SOURCES}")
   endif()

-  if (GPU_WITH_SOABI)
-    set(GPU_WITH_SOABI WITH_SOABI)
+  if (ARG_WITH_SOABI)
+    set(SOABI_KEYWORD WITH_SOABI)
   else()
-    set(GPU_WITH_SOABI)
+    set(SOABI_KEYWORD "")
   endif()

-  if (GPU_USE_SABI)
-    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  run_python(IS_FREETHREADED_PYTHON
+    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+    "Failed to determine whether interpreter is free-threaded")
+
+  # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
+  # so avoid using the stable ABI under free-threading only.
+  if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
+    Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
   else()
-    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+    Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
   endif()

-  if (GPU_LANGUAGE STREQUAL "HIP")
+  if (ARG_LANGUAGE STREQUAL "HIP")
     # Make this target dependent on the hipify preprocessor step.
-    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+    add_dependencies(${MOD_NAME} hipify${MOD_NAME})
     # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
-    target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
+      ${ARG_INCLUDE_DIRECTORIES})
   else()
-    target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
-      ${GPU_INCLUDE_DIRECTORIES})
+    target_include_directories(${MOD_NAME} PRIVATE csrc
+      ${ARG_INCLUDE_DIRECTORIES})
   endif()

-  if (GPU_ARCHITECTURES)
-    set_target_properties(${GPU_MOD_NAME} PROPERTIES
-      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+  if (ARG_ARCHITECTURES)
+    set_target_properties(${MOD_NAME} PROPERTIES
+      ${ARG_LANGUAGE}_ARCHITECTURES "${ARG_ARCHITECTURES}")
   endif()

-  target_compile_options(${GPU_MOD_NAME} PRIVATE
-    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+  target_compile_options(${MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${ARG_LANGUAGE}>:${ARG_COMPILE_FLAGS}>)

-  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
-    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
-
+  target_compile_definitions(${MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${MOD_NAME}")

-  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+  target_link_libraries(${MOD_NAME} PRIVATE torch ${ARG_LIBRARIES})

   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
   # dependencies that are not necessary and may not be installed.
-  if (GPU_LANGUAGE STREQUAL "CUDA")
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
+  if (ARG_LANGUAGE STREQUAL "CUDA")
+    target_link_libraries(${MOD_NAME} PRIVATE torch CUDA::cudart CUDA::cuda_driver ${ARG_LIBRARIES})
   else()
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(${MOD_NAME} PRIVATE torch ${TORCH_LIBRARIES} ${ARG_LIBRARIES})
   endif()

-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
+  install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${ARG_DESTINATION} COMPONENT ${MOD_NAME})
 endfunction()
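
The free-threading probe is ordinary Python and can be tried directly; on builds older than CPython 3.13 the config var is absent, so the expression falls back to 0:

    import sysconfig

    # 1 on a free-threaded (no-GIL) CPython build, 0 otherwise.
    is_freethreaded = 1 if sysconfig.get_config_var("Py_GIL_DISABLED") else 0
    print(is_freethreaded)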
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
     scalar_t* output, float* output_lse, const scalar_t* prefix_output,
     const float* prefix_lse, const scalar_t* suffix_output,
     const float* suffix_lse, const uint num_tokens, const uint num_heads,
-    const uint head_size) {
+    const uint head_size, const uint prefix_head_stride,
+    const uint output_head_stride) {
   using pack_128b_t = uint4;
   const uint pack_size = 16 / sizeof(scalar_t);
   const uint threads_per_head = head_size / pack_size;
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
   const uint head_idx = token_head_idx % num_heads;

   const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
-  const uint head_offset =
-      token_idx * num_heads * head_size + head_idx * head_size;
-  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
-  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
-  scalar_t* output_head_ptr = output + head_offset;
+  const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+                               head_idx * prefix_head_stride;
+  const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+                               head_idx * output_head_stride;
+  const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+  const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+  scalar_t* output_head_ptr = output + dst_head_offset;

   float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
   float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -46,6 +49,32 @@ __global__ void merge_attn_states_kernel(
   s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;

   const float max_lse = fmaxf(p_lse, s_lse);
+
+  /* In certain edge cases, MLA can produce p_lse = s_lse = -inf;
+     continuing the pipeline then yields NaN. Root cause: with chunked prefill
+     a batch may be split into two chunks; if a request in that batch has no
+     prefix hit, every LSE entry for that request's position is -inf, and at
+     this moment we merge cross-attention at first. For now we simply emit
+     prefix_output (expected to be all zeros) and prefix_lse (-inf) to fix
+     this problem.
+  */
+  if (std::isinf(max_lse)) {
+    if (pack_offset < head_size) {
+      // Pack 128b load
+      pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
+          prefix_head_ptr)[pack_offset / pack_size];
+
+      // Pack 128b storage
+      reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
+          p_out_pack;
+    }
+    // We only need to write to output_lse once per head.
+    if (output_lse != nullptr && pack_idx == 0) {
+      output_lse[head_idx * num_tokens + token_idx] = max_lse;
+    }
+    return;
+  }
+
   p_lse = p_lse - max_lse;
   s_lse = s_lse - max_lse;
   const float p_se = expf(p_lse);
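
The arithmetic that follows is the standard log-sum-exp merge of two partial attention results. A numpy sketch of the per-head math, including the -inf guard introduced above (names are illustrative, not the kernel's):

    import numpy as np

    def merge_attn_states(prefix_out, p_lse, suffix_out, s_lse):
        max_lse = max(p_lse, s_lse)
        if np.isinf(max_lse):  # no contribution from either partition
            return prefix_out, max_lse
        p_se = np.exp(p_lse - max_lse)
        s_se = np.exp(s_lse - max_lse)
        out = (prefix_out * p_se + suffix_out * s_se) / (p_se + s_se)
        return out, max_lse + np.log(p_se + s_se)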
@@ -114,7 +143,7 @@ __global__ void merge_attn_states_kernel(
           reinterpret_cast<float*>(prefix_lse.data_ptr()),               \
           reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),         \
           reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,   \
-          num_heads, head_size);                                         \
+          num_heads, head_size, prefix_head_stride, output_head_stride); \
   }

 /*@brief Merges the attention states from prefix and suffix
@@ -140,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
   const uint num_tokens = output.size(0);
   const uint num_heads = output.size(1);
   const uint head_size = output.size(2);
+  const uint prefix_head_stride = prefix_output.stride(1);
+  const uint output_head_stride = output.stride(1);
   const uint pack_size = 16 / sizeof(scalar_t);
   TORCH_CHECK(head_size % pack_size == 0,
               "headsize must be multiple of pack_size:", pack_size);
-  TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
-              "output heads must be contiguous in memory");
-  TORCH_CHECK(
-      prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
-      "prefix_output heads must be contiguous in memory");
-  TORCH_CHECK(
-      suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
-      "suffix_output heads must be contiguous in memory");
   float* output_lse_ptr = nullptr;
   if (output_lse.has_value()) {
     output_lse_ptr = output_lse.value().data_ptr<float>();
csrc/cache.h (11 changed lines)
@@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                  const double scale, const std::string& kv_cache_dtype);

 void gather_and_maybe_dequant_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
+    torch::Tensor const& src_cache,     // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,           // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,   // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,   // [BATCH+1]
+    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
+    int64_t num_tokens, const std::string& kv_cache_dtype,
     torch::Tensor const& scale,
     std::optional<torch::Tensor> seq_starts = std::nullopt);

@@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
 #ifndef USE_ROCM
   __syncwarp();
 #endif
+#if defined(__gfx942__)
+  float scale = fmaxf(amax, 1e-4) / 224.0f;
+#else
   float scale = fmaxf(amax, 1e-4) / 448.0f;
+#endif
   if (use_ue8m0) {
     scale = exp2f(ceilf(log2f(scale)));
   }
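
The two divisors are the representable maxima of the FP8 formats involved: 448 for OCP e4m3fn and 224 for the e4m3fnuz variant used on gfx942. A numpy sketch of the scale computation, including the ue8m0 power-of-two rounding:

    import numpy as np

    def fp8_scale(amax: float, gfx942: bool = False, use_ue8m0: bool = False) -> float:
        fp8_max = 224.0 if gfx942 else 448.0  # e4m3fnuz vs. e4m3fn max value
        scale = max(amax, 1e-4) / fp8_max
        if use_ue8m0:
            scale = float(2.0 ** np.ceil(np.log2(scale)))  # round up to a power of two
        return scale

    print(fp8_scale(100.0))                  # ~0.2232
    print(fp8_scale(100.0, use_ue8m0=True))  # 0.25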
@@ -901,87 +905,80 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
 namespace vllm {

-// grid is launched with dimensions (batch, num_splits)
-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
+          int ENTRY_SIZE, int CTA_SIZE>
 __global__ void gather_and_maybe_dequant_cache(
-    const cache_t* __restrict__ src_cache,  // [NUM_BLOCKS, BLOCK_SIZE,
-                                            // ENTRIES...]
-    scalar_t* __restrict__ dst,             // [TOT_TOKENS, ENTRIES...]
-    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
-    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
-    const int32_t block_size, const int32_t entry_size,
+    const cache_t* __restrict__ src_cache,     // [NUM_BLOCKS, BLOCK_SIZE,
+                                               // ENTRIES...]
+    scalar_t* __restrict__ dst,                // [TOT_TOKENS, ENTRIES...]
+    const int32_t* __restrict__ block_table,   // [BATCH, BLOCK_INDICES]
+    const int32_t* __restrict__ cu_seq_lens,   // [BATCH+1]
+    const int32_t* __restrict__ token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNK]
+    const int32_t num_tokens, const int32_t block_size,
     const int64_t block_table_stride, const int64_t cache_block_stride,
     const int64_t cache_entry_stride, const int64_t dst_entry_stride,
     const float* __restrict__ scale,
     const int32_t* __restrict__ seq_starts) {  // Optional: starting offsets per
                                                // batch
+  constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
+  using ltype = vllm::vec_n_t<cache_t, vec_size>;
+  using stype = vllm::vec_n_t<scalar_t, vec_size>;
+  // We are adding this for code readability which will be optimized out when
+  // build in release.
+  assert(CTA_SIZE == blockDim.x);

-  const int64_t bid = blockIdx.x;  // Batch ID
-  const int32_t num_splits = gridDim.y;
-  const int32_t split = blockIdx.y;
-  const int32_t seq_start = cu_seq_lens[bid];
-  const int32_t seq_end = cu_seq_lens[bid + 1];
-  const int32_t seq_len = seq_end - seq_start;
-  const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
-  const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
+#pragma unroll
+  for (int token_id = blockIdx.x; token_id < num_tokens;
+       token_id += gridDim.x) {
+    int64_t batch_id = token_to_seq[token_id];
+    int64_t batch_start = cu_seq_lens[batch_id];
+    int64_t batch_end = cu_seq_lens[batch_id + 1];
+    int32_t batch_offset = token_id - batch_start;

-  const int32_t split_start = split * split_blocks;
-  const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
-
-  const bool is_active_split = (split_start < tot_blocks);
-  const bool is_last_split = (split_end == tot_blocks);
-
-  if (!is_active_split) return;
-
-  int32_t full_blocks_end = split_end;
-  int32_t partial_block_size = 0;
-
-  // Adjust the pointer for the block_table for this batch.
-  // If seq_starts is provided, compute an offset based on (seq_starts[bid] /
-  // page_size)
-  const int32_t batch_offset = bid * block_table_stride;
-  int32_t offset = 0;
-  if (seq_starts != nullptr) {
-    offset = seq_starts[bid] / block_size;
-  }
-  const int32_t* batch_block_table = block_table + batch_offset + offset;
-
-  // Adjust dst pointer based on the cumulative sequence lengths.
-  dst += seq_start * dst_entry_stride;
-
-  if (is_last_split) {
-    partial_block_size = seq_len % block_size;
-    if (partial_block_size) full_blocks_end -= 1;
-  }
-
-  auto copy_entry = [&](const cache_t* __restrict__ _src,
-                        scalar_t* __restrict__ _dst) {
-    for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
+    if (token_id >= batch_end) return;
+    int32_t offset = 0;
+    if (seq_starts != nullptr) {
+      offset = seq_starts[batch_id];
+    }
+    batch_offset += offset;
+    int32_t block_table_id = batch_offset / block_size;
+    int32_t slot_id = batch_offset % block_size;
+    int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
+    int32_t block_id = block_table[block_table_offset];
+    int64_t cache_offset =
+        block_id * cache_block_stride + slot_id * cache_entry_stride;
+    constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
+    scalar_t* dst_ = dst + token_id * dst_entry_stride;
+    cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
+
+#pragma unroll
+    for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
       if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-        _dst[i] = static_cast<scalar_t>(_src[i]);
+        reinterpret_cast<stype*>(dst_)[idx] =
+            static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
       } else {
-        _dst[i] =
-            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
+        ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
+        stype store_val;
+#pragma unroll
+        for (int j = 0; j < vec_size; ++j) {
+          store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
+              loaded_val.val[j], *scale);
+        }
+        reinterpret_cast<stype*>(dst_)[idx] = store_val;
       }
     }
-  };

-  for (int pid = split_start; pid < full_blocks_end; ++pid) {
-    auto block_id = batch_block_table[pid];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
-    for (int eid = 0; eid < block_size; ++eid) {
-      copy_entry(block_start_ptr + eid * cache_entry_stride,
-                 block_dst_ptr + eid * dst_entry_stride);
-    }
-  }
-
-  if (partial_block_size) {
-    auto block_id = batch_block_table[full_blocks_end];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
-    for (int eid = 0; eid < partial_block_size; ++eid) {
-      copy_entry(block_start_ptr + eid * cache_entry_stride,
-                 block_dst_ptr + eid * dst_entry_stride);
-    }
+    // process tail
+    constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
+    dst_ = dst_ + ENTRY_SIZE - tail_cnt;
+    src_ = src_ + ENTRY_SIZE - tail_cnt;
+#pragma unroll
+    for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
+      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+        dst_[idx] = static_cast<scalar_t>(src_[idx]);
+      } else {
+        dst_[idx] =
+            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
+      }
+    }
   }
 }
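
The rewritten kernel maps one CTA per token rather than one per (batch, split) tile, so all addressing derives from token_to_seq. A numpy sketch of the gather for the kAuto (no dequantization) path:

    import numpy as np

    def gather_cache(src_cache, block_table, cu_seq_lens, token_to_seq,
                     num_tokens, block_size, seq_starts=None):
        # src_cache: [num_blocks, block_size, entry_size] -> dst: [num_tokens, entry_size]
        dst = np.empty((num_tokens, src_cache.shape[-1]), dtype=src_cache.dtype)
        for token_id in range(num_tokens):
            batch_id = token_to_seq[token_id]
            batch_offset = token_id - cu_seq_lens[batch_id]
            if seq_starts is not None:
                batch_offset += seq_starts[batch_id]
            block_id = block_table[batch_id, batch_offset // block_size]
            dst[token_id] = src_cache[block_id, batch_offset % block_size]
        return dst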
@@ -992,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
 // SCALAR_T is the data type of the destination tensor.
 // CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
-#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                        \
-  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE>          \
-      <<<grid, block, 0, stream>>>(                                          \
-          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                  \
-          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                       \
-          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),  \
-          block_size, entry_size, block_table_stride, cache_block_stride,    \
-          cache_entry_stride, dst_entry_stride,                              \
-          reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
+#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                         \
+  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576,      \
+                                       thread_block_size>                     \
+      <<<grid, block, 0, stream>>>(                                           \
+          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
+          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                        \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(),   \
+          token_to_seq.data_ptr<int32_t>(), num_tokens, block_size,           \
+          block_table_stride, cache_block_stride, cache_entry_stride,         \
+          dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
+          seq_starts_ptr);

 // Gather sequences from the cache into the destination tensor.
 // - cu_seq_lens contains the cumulative sequence lengths for each batch
 // - block_table contains the cache block indices for each sequence
+// - token_to_seq contains the back mapping from token_id to batch_id
 // - Optionally, seq_starts (if provided) offsets the starting block index by
 //   (seq_starts[bid] / page_size)
 void gather_and_maybe_dequant_cache(
-    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
-    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
-    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
-    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
+    torch::Tensor const& src_cache,     // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+    torch::Tensor const& dst,           // [TOT_TOKENS, ENTRIES...]
+    torch::Tensor const& block_table,   // [BATCH, BLOCK_INDICES]
+    torch::Tensor const& cu_seq_lens,   // [BATCH+1]
+    torch::Tensor const& token_to_seq,  // [MAX_TOKEN_ACROSS_CHUNKS]
+    int64_t num_tokens, const std::string& kv_cache_dtype,
     torch::Tensor const& scale,
     std::optional<torch::Tensor> seq_starts = std::nullopt) {
   at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   int32_t block_size = src_cache.size(1);
-  int32_t entry_size = src_cache.flatten(2, -1).size(2);
+  int32_t head_dim = dst.size(-1);

   TORCH_CHECK(block_table.dtype() == torch::kInt32,
               "block_table must be int32");
@@ -1029,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
     TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                 "seq_starts must be int32");
   }
+  TORCH_CHECK(head_dim == 576,
+              "gather_and_maybe_dequant_cache only support the head_dim to 576 "
+              "for better performance")

   TORCH_CHECK(src_cache.device() == dst.device(),
               "src_cache and dst must be on the same device");
@@ -1046,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
   int64_t cache_entry_stride = src_cache.stride(1);
   int64_t dst_entry_stride = dst.stride(0);

-  // Decide on the number of splits based on the batch size.
-  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
-  dim3 grid(batch_size, num_splits);
-  dim3 block(1024);
+  constexpr int32_t thread_block_size = 64;
+  dim3 grid(num_tokens);
+  dim3 block(thread_block_size);

   const int32_t* seq_starts_ptr =
       seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
@@ -1,798 +0,0 @@
#include "cpu_types.hpp"

namespace {

template <typename scalar_t>
struct KernelVecType {
  using q_load_vec_type = void;
  using q_vec_type = void;
  using k_load_vec_type = void;
  using k_vec_type = void;
  using qk_acc_vec_type = void;
  using v_load_vec_type = void;
};

template <>
struct KernelVecType<float> {
  using q_load_vec_type = vec_op::FP32Vec4;
  using q_vec_type = vec_op::FP32Vec16;
  using k_load_vec_type = vec_op::FP32Vec16;
  using k_vec_type = vec_op::FP32Vec16;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP32Vec16;
};

template <>
struct KernelVecType<c10::Half> {
#if defined(__powerpc64__) || defined(__s390x__)
  // Power and s390x architecture-specific vector types
  using q_load_vec_type = vec_op::FP32Vec8;
  using k_load_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::FP32Vec16;
#else
  // Fallback for other architectures, including x86
  using q_load_vec_type = vec_op::FP16Vec8;
  using k_load_vec_type = vec_op::FP16Vec16;
  using v_load_vec_type = vec_op::FP16Vec16;
#endif
  using q_vec_type = vec_op::FP32Vec16;
  using k_vec_type = vec_op::FP32Vec16;
  using qk_acc_vec_type = vec_op::FP32Vec16;
};

#ifdef __AVX512BF16__
template <>
struct KernelVecType<c10::BFloat16> {
  using q_load_vec_type = vec_op::BF16Vec8;
  using q_vec_type = vec_op::BF16Vec32;
  using k_load_vec_type = vec_op::BF16Vec32;
  using k_vec_type = vec_op::BF16Vec32;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::BF16Vec16;
};
#else
#ifdef __aarch64__
#ifndef ARM_BF16_SUPPORT
// pass
#else
template <>
struct KernelVecType<c10::BFloat16> {
  using q_load_vec_type = vec_op::BF16Vec8;
  using q_vec_type = vec_op::FP32Vec16;
  using k_load_vec_type = vec_op::BF16Vec16;
  using k_vec_type = vec_op::FP32Vec16;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
#else
template <>
struct KernelVecType<c10::BFloat16> {
  using q_load_vec_type = vec_op::BF16Vec8;
  using q_vec_type = vec_op::FP32Vec16;
  using k_load_vec_type = vec_op::BF16Vec16;
  using k_vec_type = vec_op::FP32Vec16;
  using qk_acc_vec_type = vec_op::FP32Vec16;
  using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
#endif

template <typename T>
FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
                                           const int capacity) {
  T max = data[0];
  for (int i = 1; i < size; ++i) {
    max = max >= data[i] ? max : data[i];
  }

  T sum = 0;
  for (int i = 0; i < size; ++i) {
    data[i] = std::exp(data[i] - max);
    sum += data[i];
  }

  int i = 0;
  for (; i < size; ++i) {
    data[i] /= sum;
  }

  for (; i < capacity; ++i) {
    data[i] = 0;
  }

  return {max, sum};
}
template <typename T>
FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
                                                const int capacity,
                                                const float alibi_slope,
                                                const int start_index,
                                                const int seq_len) {
  data[0] += alibi_slope * (start_index - seq_len + 1);
  T max = data[0];
  for (int i = 1; i < size; ++i) {
    T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
    data[i] = qk;
    max = max >= qk ? max : qk;
  }

  T sum = 0;
  for (int i = 0; i < size; ++i) {
    data[i] = std::exp(data[i] - max);
    sum += data[i];
  }

  int i = 0;
  for (; i < size; ++i) {
    data[i] /= sum;
  }

  for (; i < capacity; ++i) {
    data[i] = 0;
  }

  return {max, sum};
}
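
reduceSoftmaxAlibi folds the ALiBi positional bias into the logits before the usual max-shifted softmax. A numpy sketch of the same math, following the bias convention above (slope * (position - seq_len + 1)):

    import numpy as np

    def softmax_alibi(logits, alibi_slope, start_index, seq_len):
        pos = start_index + np.arange(len(logits))
        biased = logits + alibi_slope * (pos - seq_len + 1)
        shifted = np.exp(biased - biased.max())  # subtract max for stability
        return shifted / shifted.sum()

    # Later positions get less negative bias, hence more weight.
    print(softmax_alibi(np.zeros(4), alibi_slope=0.5, start_index=0, seq_len=4))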
template <typename T>
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
                                         const int size) {
  T max = max_data[0];
  for (int i = 1; i < size; ++i) {
    max = max >= max_data[i] ? max : max_data[i];
  }

  T rescaled_sum = 0;
  for (int i = 0; i < size; ++i) {
    T rescale_factor = std::exp(max_data[i] - max);
    rescaled_sum += rescale_factor * sum_data[i];
    sum_data[i] *= rescale_factor;
  }
  for (int i = 0; i < size; ++i) {
    sum_data[i] /= rescaled_sum + 1e-8;
  }
}
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
struct reduceQKBlockKernel {
  using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
  using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
  using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
  using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
  using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;

  constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
  constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
  constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;

  static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
  static_assert(k_load_vec_type::get_elem_num() % x == 0);
  static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);

  FORCE_INLINE static void call(const scalar_t* __restrict__ q,
                                const scalar_t* __restrict__ k_block,
                                float* __restrict__ logits, float scale,
                                const int token_num) {
    const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;

    qk_acc_vec_type group_accums[MAX_GROUP_NUM];
    if (token_num == BLOCK_SIZE) {
      for (int q_offset = 0; q_offset < HEAD_SIZE;
           q_offset += x, k_block += x * BLOCK_SIZE) {
        q_load_vec_type q_load_group_vec(q + q_offset);
        q_vec_type q_group_vec(q_load_group_vec);

        vec_op::unroll_loop<int, MAX_GROUP_NUM>(
            [k_block, &q_group_vec, &group_accums](int token_group_idx) {
              k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
                                                             TOKEN_PER_GROUP);
              k_vec_type k_group_vec(k_load_group_vec);
              vec_op::fma(group_accums[token_group_idx], q_group_vec,
                          k_group_vec);
              vec_op::prefetch(k_block + x * BLOCK_SIZE +
                               token_group_idx * x * TOKEN_PER_GROUP);
            });
      }
    } else {
      for (int q_offset = 0; q_offset < HEAD_SIZE;
           q_offset += x, k_block += x * BLOCK_SIZE) {
        q_load_vec_type q_load_group_vec(q + q_offset);
        q_vec_type q_group_vec(q_load_group_vec);
        for (int token_group_start = 0; token_group_start < group_num;
             token_group_start += UNROLL_GROUP_NUM) {
          vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
              [token_group_start, k_block, &q_group_vec,
               &group_accums](int token_group_idx) {
                token_group_idx += token_group_start;
                k_load_vec_type k_load_group_vec(
                    k_block + token_group_idx * x * TOKEN_PER_GROUP);
                k_vec_type k_group_vec(k_load_group_vec);
                vec_op::fma(group_accums[token_group_idx], q_group_vec,
                            k_group_vec);
                vec_op::prefetch(k_block + x * BLOCK_SIZE +
                                 token_group_idx * x * TOKEN_PER_GROUP);
              });
        }
      }
    }

    for (int token_group_idx = 0; token_group_idx < group_num;
         ++token_group_idx) {
      vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
          [&group_accums, logits, scale, token_group_idx](int token_idx) {
            float dot_v =
                group_accums[token_group_idx]
                    .template reduce_sub_sum<qk_acc_vec_type::get_elem_num() /
                                             TOKEN_PER_GROUP>(token_idx);
            logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
                dot_v * scale;
          });
    }
  }
};

template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE,
          int HEAD_PARTITION_SIZE, typename acc_t>
FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
                                   acc_t&& acc) {
  using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
  constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
  static_assert(BLOCK_SIZE == ELEM_NUM);
  vec_op::FP32Vec16 prob_vec(prob);

  vec_op::unroll_loop<int, HEAD_PARTITION_SIZE>([&](int head_elem_idx) {
    v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx);
    vec_op::FP32Vec16 fp32_v_vec(v_vec);
    acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
  });
}
};  // namespace
// Paged attention v1
|
||||
namespace {
|
||||
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
|
||||
struct paged_attention_v1_impl {
|
||||
static void call(
|
||||
scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size/x, block_size, x]
|
||||
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size, block_size]
|
||||
const int num_kv_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs,
|
||||
// max_num_blocks_per_seq]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
const int num_seqs, const int num_heads) {
|
||||
constexpr int x = 16 / sizeof(scalar_t);
|
||||
const int num_queries_per_kv = num_heads / num_kv_heads;
|
||||
|
||||
static_assert(BLOCK_SIZE == 16);
|
||||
|
||||
int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
|
||||
int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
|
||||
TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
|
||||
|
||||
const int parallel_work_item_num = omp_get_max_threads();
|
||||
|
||||
size_t logits_bytes =
|
||||
parallel_work_item_num * max_seq_len_padded * sizeof(float);
|
||||
float* logits = (float*)std::aligned_alloc(
|
||||
64, logits_bytes); // Cacheline alignment for each context token.
|
||||
// [parallel_work_item_num, max_seq_len_padded]
|
||||
|
||||
#pragma omp parallel for collapse(2) schedule(dynamic, 1)
|
||||
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
int seq_len = seq_lens[seq_idx];
|
||||
const int* seq_block_table =
|
||||
block_tables + max_num_blocks_per_seq * seq_idx;
|
||||
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int64_t kv_head_idx = head_idx / num_queries_per_kv;
|
||||
const scalar_t* __restrict__ q_vec_ptr =
|
||||
q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||
const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE;
|
||||
float* __restrict__ thread_block_logits =
|
||||
logits + omp_get_thread_num() * max_seq_len_padded;
|
||||
|
||||
// Compute logits
|
||||
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||
const scalar_t* __restrict__ k_block_cache_ptr =
|
||||
k_cache + physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride;
|
||||
float* __restrict__ head_block_logits =
|
||||
thread_block_logits + block_idx * BLOCK_SIZE;
|
||||
|
||||
reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
|
||||
q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
|
||||
block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
|
||||
}
|
||||
|
||||
// Compute softmax
|
||||
if (alibi_slopes) {
|
||||
reduceSoftmaxAlibi(thread_block_logits, seq_len,
|
||||
block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
|
||||
seq_len);
|
||||
} else {
|
||||
reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
|
||||
}
|
||||
|
||||
// Compute value
|
||||
constexpr int head_elem_num_per_partition = 16;
|
||||
constexpr int head_partition_num =
|
||||
HEAD_SIZE / head_elem_num_per_partition;
|
||||
for (int head_part_idx = 0; head_part_idx < head_partition_num;
|
||||
++head_part_idx) {
|
||||
vec_op::FP32Vec16 accums[head_elem_num_per_partition];
|
||||
scalar_t* __restrict__ out_ptr =
|
||||
out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
|
||||
head_part_idx * head_elem_num_per_partition;
|
||||
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||
const float* __restrict__ prob_vec_ptr =
|
||||
thread_block_logits + block_idx * BLOCK_SIZE;
|
||||
const scalar_t* __restrict__ v_block_cache_ptr =
|
||||
v_cache + physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride +
|
||||
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||
reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
|
||||
head_elem_num_per_partition>(
|
||||
prob_vec_ptr, v_block_cache_ptr, accums);
|
||||
|
||||
if (block_idx != block_num - 1) {
|
||||
const int64_t next_physical_block_idx =
|
||||
seq_block_table[block_idx + 1];
|
||||
const scalar_t* __restrict__ next_v_block_cache_ptr =
|
||||
v_cache + next_physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride +
|
||||
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||
vec_op::unroll_loop<int, head_elem_num_per_partition>(
|
||||
[&](int head_elem_idx) {
|
||||
if (head_elem_idx % 2 == 0) {
|
||||
vec_op::prefetch(next_v_block_cache_ptr +
|
||||
BLOCK_SIZE * head_elem_idx);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
vec_op::unroll_loop<int, head_elem_num_per_partition>(
|
||||
[&](int head_elem_idx) {
|
||||
float value = accums[head_elem_idx].reduce_sum();
|
||||
vec_op::storeFP32(value, out_ptr + head_elem_idx);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
std::free(logits);
|
||||
}
|
||||
};
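// Each OpenMP thread above gets its own logits row, padded so rows start on
// cache-line boundaries: (max_seq_len + 15) & 0xFFFFFFF0 rounds the float
// count up to a multiple of 16 (64 bytes), which the TORCH_CHECK verifies.
// A minimal illustration of the idiom (hypothetical helper name; ~15 is the
// same mask for non-negative int values):
constexpr int round_up_16(int n) { return (n + 15) & ~15; }
static_assert(round_up_16(1) == 16);
static_assert(round_up_16(16) == 16);
static_assert(round_up_16(17) == 32);  // 17 floats pad to 32 floats = 128 B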

#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                   \
  paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call(                     \
      out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
      block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,                  \
      alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs,   \
      num_heads);

template <typename T, int BLOCK_SIZE>
void paged_attention_v1_impl_launcher(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int num_kv_heads, float scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
    const std::optional<torch::Tensor>& alibi_slopes) {
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);
  int q_stride = query.stride(0);
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
          : nullptr;

  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
  T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
  T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
  int* block_tables_ptr = block_tables.data_ptr<int>();
  int* seq_lens_ptr = seq_lens.data_ptr<int>();

  switch (head_size) {
    case 32:
      LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
      break;
    case 64:
      LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
      break;
    case 80:
      LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
      break;
    case 96:
      LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
      break;
    case 112:
      LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
      break;
    case 128:
      LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
      break;
    case 192:
      LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
      break;
    case 256:
      LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
      break;
    default:
      TORCH_CHECK(false, "Unsupported head size: ", head_size);
      break;
  }
}

#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE)                                \
  paged_attention_v1_impl_launcher<T, BLOCK_SIZE>(                            \
      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
      seq_lens, max_seq_len, alibi_slopes);

#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T)                       \
  switch (block_size) {                                             \
    case 16:                                                        \
      CALL_V1_KERNEL_LAUNCHER(T, 16);                               \
      break;                                                        \
    default:                                                        \
      TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
      break;                                                        \
  }
}  // namespace
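// Together with VLLM_DISPATCH_FLOATING_TYPES below, the macros above form a
// three-level runtime-to-compile-time dispatch: scalar type, then block size,
// then head size each select one template instantiation. A stripped-down,
// self-contained sketch of the same pattern (hypothetical names, no vLLM
// dependencies):
#include <cstdio>
#include <stdexcept>

// Stand-in for the fully specialized kernel, i.e. the role that
// paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call plays above.
template <typename T, int BLOCK_SIZE, int HEAD_SIZE>
void kernel() {
  std::printf("block=%d head=%d\n", BLOCK_SIZE, HEAD_SIZE);
}

// One switch per template parameter turns a runtime value into a compile-time
// constant, which is the shape the LAUNCH_/CALL_ macros expand to.
template <typename T, int BLOCK_SIZE>
void dispatch_head_size(int head_size) {
  switch (head_size) {
    case 64:
      kernel<T, BLOCK_SIZE, 64>();
      break;
    case 128:
      kernel<T, BLOCK_SIZE, 128>();
      break;
    default:
      throw std::invalid_argument("unsupported head size");
  }
}

int main() { dispatch_head_size<float, 16>(128); }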

void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step) {
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
                               [&] {
                                 CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
                                 CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
                                 CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
                               });
}

// Paged attention v2
namespace {
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
struct paged_attention_v2_impl {
  static void call(
      scalar_t* __restrict__ out,    // [num_seqs, num_heads, head_size]
      float* __restrict__ exp_sums,  // [num_seqs, num_heads,
                                     // max_num_partitions]
      float* __restrict__ max_logits,  // [num_seqs, num_heads,
                                       // max_num_partitions]
      scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                       // max_num_partitions, head_size]
      const scalar_t* __restrict__ q,  // [num_seqs, num_heads, head_size]
      const scalar_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
                                             // head_size/x, block_size, x]
      const scalar_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
                                             // head_size, block_size]
      const int num_kv_heads, const float scale,
      const int* __restrict__ block_tables,  // [num_seqs,
                                             // max_num_blocks_per_seq]
      const int* __restrict__ seq_lens,      // [num_seqs]
      const int max_num_blocks_per_seq,
      const float* __restrict__ alibi_slopes,  // [num_heads]
      const int q_stride, const int kv_block_stride, const int kv_head_stride,
      const int num_seqs, const int num_heads, const int max_num_partitions) {
    constexpr int x = 16 / sizeof(scalar_t);
    const int num_queries_per_kv = num_heads / num_kv_heads;

    static_assert(BLOCK_SIZE == 16);
    static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
    static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);

#pragma omp parallel for collapse(3) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int partition_idx = 0; partition_idx < max_num_partitions;
           ++partition_idx) {
        for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
          const int seq_len = seq_lens[seq_idx];
          const int start_token_idx = partition_idx * PARTITION_SIZE;

          if (start_token_idx >= seq_len) continue;

          const int partition_num =
              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
          const bool no_reduce = (partition_num == 1);
          const int token_num =
              (std::min(seq_len, start_token_idx + PARTITION_SIZE) -
               start_token_idx);
          const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
          const int last_block_token_num =
              token_num - (block_num - 1) * BLOCK_SIZE;
          const int* seq_block_table = block_tables +
                                       max_num_blocks_per_seq * seq_idx +
                                       start_token_idx / BLOCK_SIZE;
          const int64_t kv_head_idx = head_idx / num_queries_per_kv;
          const scalar_t* __restrict__ q_vec_ptr =
              q + seq_idx * q_stride + head_idx * HEAD_SIZE;

          float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};

          // Compute logits
          for (int block_idx = 0; block_idx < block_num; ++block_idx) {
            const int64_t physical_block_idx = seq_block_table[block_idx];
            const scalar_t* __restrict__ k_block_cache_ptr =
                k_cache + physical_block_idx * kv_block_stride +
                kv_head_idx * kv_head_stride;
            float* __restrict__ head_block_logits =
                logits + block_idx * BLOCK_SIZE;

            reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
                q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
                block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
          }

          std::pair<float, float> max_and_sum;
          if (alibi_slopes) {
            max_and_sum = reduceSoftmaxAlibi(
                logits, token_num, block_num * BLOCK_SIZE,
                alibi_slopes[head_idx], start_token_idx, seq_len);
          } else {
            max_and_sum =
                reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
          }

          auto&& [max_logit, exp_sum] = max_and_sum;

          scalar_t* __restrict__ output_buffer = nullptr;
          if (!no_reduce) {
            auto idx = seq_idx * num_heads * max_num_partitions +
                       head_idx * max_num_partitions + partition_idx;
            max_logits[idx] = max_logit;
            exp_sums[idx] = exp_sum;
            output_buffer =
                tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
                head_idx * max_num_partitions * HEAD_SIZE +
                partition_idx * HEAD_SIZE;
          } else {
            output_buffer =
                out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
          }

          // Compute value
          constexpr int head_elem_num_per_partition = 16;
          constexpr int head_partition_num =
              HEAD_SIZE / head_elem_num_per_partition;
          for (int head_part_idx = 0; head_part_idx < head_partition_num;
               ++head_part_idx) {
            vec_op::FP32Vec16 accums[head_elem_num_per_partition];
            scalar_t* __restrict__ out_ptr =
                output_buffer + head_part_idx * head_elem_num_per_partition;
            for (int block_idx = 0; block_idx < block_num; ++block_idx) {
              const int64_t physical_block_idx = seq_block_table[block_idx];
              const float* __restrict__ prob_vec_ptr =
                  logits + block_idx * BLOCK_SIZE;
              const scalar_t* __restrict__ v_block_cache_ptr =
                  v_cache + physical_block_idx * kv_block_stride +
                  kv_head_idx * kv_head_stride +
                  BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
              reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
                               head_elem_num_per_partition>(
                  prob_vec_ptr, v_block_cache_ptr, accums);

              if (block_idx != block_num - 1) {
                const int64_t next_physical_block_idx =
                    seq_block_table[block_idx + 1];
                const scalar_t* __restrict__ next_v_block_cache_ptr =
                    v_cache + next_physical_block_idx * kv_block_stride +
                    kv_head_idx * kv_head_stride +
                    BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
                vec_op::unroll_loop<int, head_elem_num_per_partition>(
                    [&](int head_elem_idx) {
                      if (head_elem_idx % 2 == 0) {
                        vec_op::prefetch(next_v_block_cache_ptr +
                                         BLOCK_SIZE * head_elem_idx);
                      }
                    });
              }
            }

            vec_op::unroll_loop<int, head_elem_num_per_partition>(
                [&](int head_elem_idx) {
                  float value = accums[head_elem_idx].reduce_sum();
                  vec_op::storeFP32(value, out_ptr + head_elem_idx);
                });
          }
        }
      }
    }

    // Rescale partition softmax and store the factors to exp_sums
#pragma omp parallel for collapse(2) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        const int seq_len = seq_lens[seq_idx];
        const int partition_num =
            (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;

        if (partition_num == 1) continue;

        reducePartitionSoftmax(
            max_logits + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            exp_sums + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            partition_num);
      }
    }

    // Reduce values
    using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
    static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
    constexpr int head_elem_num_per_group =
        16;  // Note: not aligned to the cache-line size, since some HEAD_SIZE
             // values are not multiples of 64 bytes
    static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
    constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
    const float* __restrict__ rescale_factors = exp_sums;
#pragma omp parallel for collapse(3) schedule(static, 1)
    for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
      for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
        for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
          const int seq_len = seq_lens[seq_idx];
          const int partition_num =
              (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;

          if (partition_num == 1) continue;

          const float* __restrict__ seq_head_rescale_factors =
              rescale_factors + seq_idx * num_heads * max_num_partitions +
              head_idx * max_num_partitions;
          const scalar_t* __restrict__ seq_head_tmp_out =
              tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
              head_idx * max_num_partitions * HEAD_SIZE +
              group_idx * head_elem_num_per_group;
          scalar_t* __restrict__ seq_head_output =
              out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
              group_idx * head_elem_num_per_group;

          vec_op::FP32Vec16 acc;
          for (int i = 0; i < partition_num; ++i) {
            vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
            v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
            vec_op::FP32Vec16 fp32_value(value);
            acc = acc + fp32_value * rescale_factor;
          }
          v_load_vec_type cast_acc(acc);
          cast_acc.save(seq_head_output);
        }
      }
    }
  }
};
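// v2 processes each sequence in PARTITION_SIZE-token chunks with an
// independent softmax per chunk, then merges: reducePartitionSoftmax rewrites
// each partition's entry in exp_sums into a rescale factor, and the final
// loop computes out = sum_i factor_i * tmp_out_i. Below is a scalar sketch of
// that merge under the standard split-softmax formulation (hypothetical
// helper; the actual factor layout is whatever reducePartitionSoftmax leaves
// in exp_sums).
#include <algorithm>
#include <cmath>
#include <vector>

// Merge per-partition softmax results for one output element. partial[i] is
// partition i's value sum weighted by a softmax normalized only within i;
// max_logits[i] / exp_sums[i] are that partition's running max and exp-sum.
float merge_partitions(const std::vector<float>& max_logits,
                       const std::vector<float>& exp_sums,
                       const std::vector<float>& partial) {
  const float m = *std::max_element(max_logits.begin(), max_logits.end());
  std::vector<float> factor(partial.size());
  float global_sum = 0.f;
  for (size_t i = 0; i < partial.size(); ++i) {
    factor[i] = std::exp(max_logits[i] - m) * exp_sums[i];
    global_sum += factor[i];
  }
  float out = 0.f;
  for (size_t i = 0; i < partial.size(); ++i) {
    out += (factor[i] / global_sum) * partial[i];
  }
  return out;
}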

#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE)                 \
  paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call(   \
      out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr,         \
      key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
      seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,      \
      kv_block_stride, kv_head_stride, num_seqs, num_heads,                  \
      max_num_partitions);

template <typename T, int BLOCK_SIZE, int PARTITION_SIZE = 512>
void paged_attention_v2_impl_launcher(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int num_kv_heads, float scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
    int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);
  int q_stride = query.stride(0);
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);
  int max_num_partitions = exp_sums.size(-1);

  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
          : nullptr;

  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
  T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
  T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
  int* block_tables_ptr = block_tables.data_ptr<int>();
  int* seq_lens_ptr = seq_lens.data_ptr<int>();

  switch (head_size) {
    case 32:
      LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
      break;
    case 64:
      LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
      break;
    case 80:
      LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
      break;
    case 96:
      LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
      break;
    case 112:
      LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
      break;
    case 128:
      LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
      break;
    case 192:
      LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
      break;
    case 256:
      LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
      break;
    default:
      TORCH_CHECK(false, "Unsupported head size: ", head_size);
      break;
  }
}

#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE)                              \
  paged_attention_v2_impl_launcher<T, BLOCK_SIZE>(                          \
      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,    \
      num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \
      alibi_slopes);

#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T)                       \
  switch (block_size) {                                             \
    case 16:                                                        \
      CALL_V2_KERNEL_LAUNCHER(T, 16);                               \
      break;                                                        \
    default:                                                        \
      TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
      break;                                                        \
  }
}  // namespace

void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
    torch::Tensor& v_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step) {
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
                               [&] {
                                 CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
                                 CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
                                 CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
                               });
}
@@ -1,214 +0,0 @@
#include <map>
#include <vector>

#include "cpu_types.hpp"

#if defined(__x86_64__)
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
#else
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
#endif

namespace {
template <typename scalar_t>
void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
                          std::vector<torch::Tensor> const& value_caches,
                          const torch::Tensor& mapping_pairs,
                          const int element_num_per_block,
                          const int layer_num) {
  const size_t pair_num = mapping_pairs.size(0);
  const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
#pragma omp parallel for collapse(2)
  for (int layer = 0; layer < layer_num; ++layer) {
    for (size_t pair = 0; pair < pair_num; ++pair) {
      int64_t source_offset =
          element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
      int64_t target_offset =
          element_num_per_block * mapping_pairs[pair][1].item<int64_t>();
      scalar_t* key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
      scalar_t* source_ptr = key_cache_ptr + source_offset;
      scalar_t* target_ptr = key_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);

      scalar_t* value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
      source_ptr = value_cache_ptr + source_offset;
      target_ptr = value_cache_ptr + target_offset;
      std::memcpy(target_ptr, source_ptr, block_bytes);
    }
  }
}

template <typename scalar_t>
void reshape_and_cache_cpu_impl(
    const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
    scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
    const int64_t* __restrict__ slot_mapping, const int num_tokens,
    const int key_stride, const int value_stride, const int num_heads,
    const int head_size, const int block_size, const int x) {
  const int block_elem_num = num_heads * head_size * block_size;

#pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
      const int64_t slot_idx = slot_mapping[token_idx];
      if (slot_idx >= 0) {
        int src_key_head_idx = token_idx * key_stride + head_idx * head_size;
        int src_value_head_idx =
            token_idx * value_stride + head_idx * head_size;
        const scalar_t* src_key_head_ptr = key + src_key_head_idx;
        const scalar_t* src_value_head_ptr = value + src_value_head_idx;
        const int64_t block_index = slot_idx / block_size;
        const int64_t block_offset = slot_idx % block_size;
        scalar_t* target_key_head_ptr = key_cache +
                                        block_elem_num * block_index +
                                        head_idx * block_size * head_size;
        scalar_t* target_value_head_ptr = value_cache +
                                          block_elem_num * block_index +
                                          head_idx * block_size * head_size;

        for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) {
          const int64_t target_offset =
              src_key_idx * block_size + block_offset * x;
          for (int i = 0; i < x; ++i) {
            target_key_head_ptr[target_offset + i] =
                src_key_head_ptr[src_key_idx + i];
          }
        }

        for (int src_value_idx = 0; src_value_idx < head_size;
             ++src_value_idx) {
          const int64_t target_offset =
              src_value_idx * block_size + block_offset;
          target_value_head_ptr[target_offset] =
              src_value_head_ptr[src_value_idx];
        }
      }
    }
  }
}
}  // namespace
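// The key cache written above uses the [num_blocks, num_kv_heads,
// head_size/x, block_size, x] layout, so a head element lands at
// (head_elem / x) * block_size * x + block_offset * x + head_elem % x within
// its (block, head) slice. A self-contained check of that index math
// (hypothetical helper and values):
constexpr int key_cache_offset(int head_elem, int block_offset, int block_size,
                               int x) {
  // chunk = which head_size/x group, lane = position inside the x-wide group
  return (head_elem / x) * block_size * x + block_offset * x + head_elem % x;
}
// With x = 8 and block_size = 16, head element 10 of the token at block
// offset 3 lands in chunk 1, lane 2.
static_assert(key_cache_offset(10, 3, 16, 8) == 1 * 16 * 8 + 3 * 8 + 2);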

template <typename scalar_t>
void concat_and_cache_mla_cpu_impl(
    const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
    const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim]
    scalar_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
                                      // + pe_dim)]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int num_tokens,                      //
    const int block_stride,                    //
    const int entry_stride,                    //
    const int kv_c_stride,                     //
    const int k_pe_stride,                     //
    const int kv_lora_rank,                    //
    const int pe_dim,                          //
    const int block_size                       //
) {
#pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    const int64_t slot_idx = slot_mapping[token_idx];
    // NOTE: slot_idx can be -1 if the token is padded
    if (slot_idx < 0) {
      continue;
    }
    const int64_t block_idx = slot_idx / block_size;
    const int64_t block_offset = slot_idx % block_size;

    auto copy = [&](const scalar_t* __restrict__ src,
                    scalar_t* __restrict__ dst, int src_stride, int dst_stride,
                    int size, int offset) {
      for (int i = 0; i < size; i++) {
        const int64_t src_idx = token_idx * src_stride + i;
        const int64_t dst_idx =
            block_idx * block_stride + block_offset * entry_stride + i + offset;
        dst[dst_idx] = src[src_idx];
      }
    };

    copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
    copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
  }
}
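// Each cache entry written above is the token's compressed KV vector followed
// by its rotary positional part: kv_cache[block][slot % block_size] =
// concat(kv_c[token], k_pe[token]). The destination index both copy() calls
// compute reduces to this helper (a sketch; offset carries the kv_lora_rank
// shift for the k_pe half):
#include <cstdint>

int64_t mla_dst_index(int64_t slot_idx, int64_t block_size,
                      int64_t block_stride, int64_t entry_stride, int64_t i,
                      int64_t offset) {
  const int64_t block_idx = slot_idx / block_size;
  const int64_t block_offset = slot_idx % block_size;
  return block_idx * block_stride + block_offset * entry_stride + i + offset;
}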

// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                 std::vector<torch::Tensor> const& value_caches,
                 const torch::Tensor& block_mapping) {
  unsigned num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
  if (num_layers == 0) {
    return;
  }

  const int element_num_per_block = key_caches[0][0].numel();
  DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
    CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
    copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
                                   element_num_per_block, num_layers);
    CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
  });
}

void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
                       const std::string& kv_cache_dtype,
                       torch::Tensor& k_scale, torch::Tensor& v_scale) {
  int num_tokens = key.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
  int block_size = key_cache.size(3);
  int x = key_cache.size(4);

  int key_stride = key.stride(0);
  int value_stride = value.stride(0);

  DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
    CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
    reshape_and_cache_cpu_impl<scalar_t>(
        key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
        key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
        slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride,
        num_heads, head_size, block_size, x);
    CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
  });
}

void concat_and_cache_mla(
    torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
    torch::Tensor& k_pe,          // [num_tokens, pe_dim]
    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
                                  // pe_dim)]
    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
    const std::string& kv_cache_dtype, torch::Tensor& scale) {
  int num_tokens = slot_mapping.size(0);
  int kv_lora_rank = kv_c.size(1);
  int pe_dim = k_pe.size(1);
  int block_size = kv_cache.size(1);

  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
  TORCH_CHECK(kv_cache_dtype != "fp8");

  int kv_c_stride = kv_c.stride(0);
  int k_pe_stride = k_pe.stride(0);
  int block_stride = kv_cache.stride(0);
  int entry_stride = kv_cache.stride(1);

  VLLM_DISPATCH_FLOATING_TYPES(
      kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
        CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
        concat_and_cache_mla_cpu_impl<scalar_t>(
            kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
            kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
            num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
            kv_lora_rank, pe_dim, block_size);
        CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
      });
}

void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping) {
  TORCH_CHECK(false, "swap_blocks is unsupported on CPU.");
}
csrc/cpu/cpu_attn.cpp (new file, 266 lines)
@@ -0,0 +1,266 @@
#include "cpu_attn_vec.hpp"
#include "cpu_attn_vec16.hpp"

#ifdef CPU_CAPABILITY_AMXBF16
#include "cpu_attn_amx.hpp"
#define AMX_DISPATCH(...)                                                   \
  case cpu_attention::ISA::AMX: {                                           \
    using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
                                                   scalar_t, head_dim>;     \
    return __VA_ARGS__();                                                   \
  }
#else
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
#endif

#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
#define NEON_DISPATCH(...)                                                   \
  case cpu_attention::ISA::NEON: {                                          \
    using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
                                                   scalar_t, head_dim>;     \
    return __VA_ARGS__();                                                   \
  }
#else
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
#endif  // #ifdef __aarch64__

#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
  case HEAD_DIM: {                            \
    constexpr size_t head_dim = HEAD_DIM;     \
    return __VA_ARGS__();                     \
  }

#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...)           \
  [&] {                                                         \
    switch (HEAD_DIM) {                                         \
      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                   \
      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                   \
      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                   \
      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                  \
      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                  \
      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                  \
      CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__)                  \
      CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__)                  \
      default: {                                                \
        TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
                               std::to_string(HEAD_DIM));       \
      }                                                         \
    }                                                           \
  }()

#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...)                                 \
  [&] {                                                                       \
    switch (ISA_TYPE) {                                                       \
      AMX_DISPATCH(__VA_ARGS__)                                               \
      NEON_DISPATCH(__VA_ARGS__)                                              \
      case cpu_attention::ISA::VEC: {                                         \
        using attn_impl =                                                     \
            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,   \
                                         head_dim>;                           \
        return __VA_ARGS__();                                                 \
      }                                                                       \
      case cpu_attention::ISA::VEC16: {                                       \
        using attn_impl =                                                     \
            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
                                         head_dim>;                           \
        return __VA_ARGS__();                                                 \
      }                                                                       \
      default: {                                                              \
        TORCH_CHECK(false, "Invalid CPU attention ISA type.");                \
      }                                                                       \
    }                                                                         \
  }()

torch::Tensor get_scheduler_metadata(
    const int64_t num_req, const int64_t num_heads_q,
    const int64_t num_heads_kv, const int64_t head_dim,
    const torch::Tensor& seq_lens, at::ScalarType dtype,
    const torch::Tensor& query_start_loc, const bool casual,
    const int64_t window_size, const std::string& isa_hint,
    const bool enable_kv_split) {
  cpu_attention::ISA isa;
  if (isa_hint == "amx") {
    isa = cpu_attention::ISA::AMX;
  } else if (isa_hint == "vec") {
    isa = cpu_attention::ISA::VEC;
  } else if (isa_hint == "vec16") {
    isa = cpu_attention::ISA::VEC16;
  } else if (isa_hint == "neon") {
    isa = cpu_attention::ISA::NEON;
  } else {
    TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
  }

  cpu_attention::AttentionScheduler::ScheduleInput input;
  input.num_reqs = num_req;
  input.num_heads_q = num_heads_q;
  input.num_heads_kv = num_heads_kv;
  input.head_dim = head_dim;
  input.query_start_loc = query_start_loc.data_ptr<int32_t>();
  input.seq_lens = seq_lens.data_ptr<int32_t>();
  if (window_size != -1) {
    input.left_sliding_window_size = window_size - 1;
    if (casual) {
      input.right_sliding_window_size = 0;
    } else {
      input.right_sliding_window_size = window_size - 1;
    }
  } else {
    input.left_sliding_window_size = -1;
    if (casual) {
      input.right_sliding_window_size = 0;
    } else {
      input.right_sliding_window_size = -1;
    }
  }
  input.casual = casual;
  input.isa = isa;
  input.enable_kv_split = enable_kv_split;
  TORCH_CHECK(casual, "Only supports causal mask for now.");

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
      CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
        input.elem_size = sizeof(scalar_t);
        input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
        input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
        input.output_buffer_elem_size =
            sizeof(attn_impl::partial_output_buffer_t);
        input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
        input.kv_block_alignment = attn_impl::BlockSizeAlignment;
      });
    });
  });

  cpu_attention::AttentionScheduler scheduler;
  torch::Tensor metadata = scheduler.schedule(input);
  return metadata;
}
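// For reference, an invocation might look like the following (hypothetical
// sizes; argument names as in the signature above): schedule 4 requests with
// 32 query heads, 8 KV heads, head_dim 128, bf16, causal attention and no
// sliding window on the AMX path.
//
//   torch::Tensor meta = get_scheduler_metadata(
//       /*num_req=*/4, /*num_heads_q=*/32, /*num_heads_kv=*/8,
//       /*head_dim=*/128, seq_lens, at::kBFloat16, query_start_loc,
//       /*casual=*/true, /*window_size=*/-1, /*isa_hint=*/"amx",
//       /*enable_kv_split=*/true);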

void cpu_attn_reshape_and_cache(
    const torch::Tensor& key,    // [token_num, head_num, head_size]
    const torch::Tensor& value,  // [token_num, head_num, head_size]
    torch::Tensor&
        key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
    torch::Tensor&
        value_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
    const torch::Tensor& slot_mapping, const std::string& isa) {
  TORCH_CHECK_EQ(key.dim(), 3);
  TORCH_CHECK_EQ(value.dim(), 3);
  TORCH_CHECK_EQ(key_cache.dim(), 4);
  TORCH_CHECK_EQ(value_cache.dim(), 4);
  TORCH_CHECK_EQ(key.stride(2), 1);
  TORCH_CHECK_EQ(value.stride(2), 1);

  const int64_t token_num = key.size(0);
  const int64_t key_token_num_stride = key.stride(0);
  const int64_t value_token_num_stride = value.stride(0);
  const int64_t head_num = value.size(1);
  const int64_t key_head_num_stride = key.stride(1);
  const int64_t value_head_num_stride = value.stride(1);
  const int64_t num_blocks = key_cache.size(0);
  const int64_t num_blocks_stride = key_cache.stride(0);
  const int64_t cache_head_num_stride = key_cache.stride(1);
  const int64_t block_size = key_cache.size(2);
  const int64_t block_size_stride = key_cache.stride(2);
  const int64_t head_dim = key.size(-1);

  cpu_attention::ISA isa_tag = [&]() {
    if (isa == "amx") {
      return cpu_attention::ISA::AMX;
    } else if (isa == "vec") {
      return cpu_attention::ISA::VEC;
    } else if (isa == "vec16") {
      return cpu_attention::ISA::VEC16;
    } else if (isa == "neon") {
      return cpu_attention::ISA::NEON;
    } else {
      TORCH_CHECK(false, "Invalid ISA type: " + isa);
    }
  }();

  VLLM_DISPATCH_FLOATING_TYPES(
      key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
        CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
          CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
            attn_impl::reshape_and_cache(
                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
                key_cache.data_ptr<scalar_t>(),
                value_cache.data_ptr<scalar_t>(),
                slot_mapping.data_ptr<int64_t>(), token_num,
                key_token_num_stride, value_token_num_stride, head_num,
                key_head_num_stride, value_head_num_stride, num_blocks,
                num_blocks_stride, cache_head_num_stride, block_size,
                block_size_stride);
          });
        });
      });
}

void cpu_attention_with_kv_cache(
    const torch::Tensor& query,  // [num_tokens, num_heads, head_size]
    const torch::Tensor&
        key_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
    const torch::Tensor&
        value_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
    torch::Tensor& output,                  // [num_tokens, num_heads, head_size]
    const torch::Tensor& query_start_loc,   // [num_tokens + 1]
    const torch::Tensor& seq_lens,          // [num_tokens]
    const double scale, const bool causal,
    const std::optional<torch::Tensor>& alibi_slopes,  // [num_heads]
    const int64_t sliding_window_left, const int64_t sliding_window_right,
    const torch::Tensor& block_table,  // [num_tokens, max_block_num]
    const double softcap, const torch::Tensor& scheduler_metadata,
    const std::optional<torch::Tensor>& s_aux  // [num_heads]
) {
  TORCH_CHECK_EQ(query.dim(), 3);
  TORCH_CHECK_EQ(query.stride(2), 1);
  TORCH_CHECK_EQ(key_cache.dim(), 4);
  TORCH_CHECK_EQ(value_cache.dim(), 4);

  cpu_attention::AttentionInput input;
  input.metadata = reinterpret_cast<cpu_attention::AttentionMetadata*>(
      scheduler_metadata.data_ptr());
  input.num_tokens = query.size(0);
  input.num_heads = query.size(1);
  input.num_kv_heads = key_cache.size(1);
  input.block_size = key_cache.size(2);
  input.query = query.data_ptr();
  input.query_num_tokens_stride = query.stride(0);
  input.query_num_heads_stride = query.stride(1);
  input.cache_num_blocks_stride = key_cache.stride(0);
  input.cache_num_kv_heads_stride = key_cache.stride(1);
  input.blt_num_tokens_stride = block_table.stride(0);
  input.key_cache = key_cache.data_ptr();
  input.value_cache = value_cache.data_ptr();
  input.output = output.data_ptr();
  input.query_start_loc = query_start_loc.data_ptr<int32_t>();
  input.seq_lens = seq_lens.data_ptr<int32_t>();
  input.block_table = block_table.data_ptr<int32_t>();
  input.alibi_slopes =
      alibi_slopes.has_value() ? alibi_slopes->data_ptr<float>() : nullptr;
  // For now sink must be bf16
  input.s_aux = s_aux.has_value() ? s_aux->data_ptr<c10::BFloat16>() : nullptr;
  input.scale = scale;
  input.causal = causal;
  input.sliding_window_left = sliding_window_left;
  input.sliding_window_right = sliding_window_right;
  if (input.causal) {
    // to make boundary calculation easier
    input.sliding_window_right = 0;
  }
  float softcap_fp32 = softcap;
  input.softcap = softcap_fp32;

  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
        CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
          CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
            TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
            cpu_attention::AttentionMainLoop<attn_impl> mainloop;
            mainloop(&input);
          });
        });
      });
}
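// The causal special case above (forcing sliding_window_right to 0) lets one
// boundary rule cover both masks: query position q may attend to keys in
// [q - left, q + right], with -1 meaning unbounded. A sketch of that rule
// (hypothetical helper, not the kernel's actual boundary code):
#include <algorithm>

struct KeyRange {
  int lo, hi;  // inclusive bounds into the key sequence
};

// Allowed key positions for query position q_pos; left/right of -1 mean
// unbounded, and causal attention is simply right == 0.
KeyRange allowed_keys(int q_pos, int seq_len, int left, int right) {
  const int lo = (left < 0) ? 0 : std::max(0, q_pos - left);
  const int hi =
      (right < 0) ? seq_len - 1 : std::min(seq_len - 1, q_pos + right);
  return {lo, hi};
}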
csrc/cpu/cpu_attn_amx.hpp (new file, 511 lines)
@@ -0,0 +1,511 @@
#ifndef CPU_ATTN_AMX_HPP
#define CPU_ATTN_AMX_HPP

#include "cpu_attn_impl.hpp"

namespace cpu_attention {
namespace {
// AMX specific
constexpr static int64_t AMX_TILE_ROW_BYTES = 64;
constexpr static int64_t AMX_TILE_ROW_NUM = 16;
constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM;
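// These constants pin the tile geometry: one AMX tile register holds
// AMX_TILE_ROW_NUM x AMX_TILE_ROW_BYTES = 16 x 64 bytes = 1 KiB, i.e.
// 16 x 32 bf16 elements or 16 x 16 fp32 elements.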

typedef struct __tile_config {
  uint8_t palette_id = 1;
  uint8_t start_row = 0;
  uint8_t reserved_0[14] = {0};
  uint16_t colsb[16] = {0};
  uint8_t rows[16] = {0};
} __tilecfg;

// 2-2-4 pattern, for 16 < m <= 32
// TILE 0, 1: load A matrix, row num should be 16, m - 16
// TILE 2, 3: load B matrix, row num should be 16
// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16,
// m - 16
template <typename kv_cache_t>
class TileGemm224 {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
                                void* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224");
  }

  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224");
  }
};

template <>
class TileGemm224<c10::BFloat16> {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size,
                                c10::BFloat16* __restrict__ a_tile,
                                c10::BFloat16* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    const int32_t k_times =
        dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
    c10::BFloat16* __restrict__ a_tile_0 = a_tile;
    c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM;
    const int64_t a_tile_stride = [&]() {
      if constexpr (phase == AttentionGemmPhase::QK) {
        // q_buffer is prepacked
        return AMX_TILE_ROW_BYTES;
      } else if constexpr (phase == AttentionGemmPhase::PV) {
        // logits_buffer is row-major
        return lda * sizeof(c10::BFloat16);
      } else {
        TORCH_CHECK(false, "Unreachable");
      }
    }();

    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
      if constexpr (phase == AttentionGemmPhase::QK) {
        // k_cache is prepacked
        return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
      } else if constexpr (phase == AttentionGemmPhase::PV) {
        // v_cache is prepacked
        return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
      } else {
        TORCH_CHECK(false, "Unreachable");
      }
    }();
    // k_cache, v_cache are prepacked
    const int32_t b_tile_stride = AMX_TILE_ROW_BYTES;

    // logits_buffer, output_buffer are not prepacked
    float* __restrict__ c_tile_4 = c_tile;
    float* __restrict__ c_tile_5 =
        c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float);
    float* __restrict__ c_tile_6 = c_tile + AMX_TILE_ROW_NUM * ldc;
    float* __restrict__ c_tile_7 =
        c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float);
    const int32_t c_tile_stride = ldc * sizeof(float);

    if (accum_c) {
      _tile_loadd(4, c_tile_4, c_tile_stride);
      _tile_loadd(5, c_tile_5, c_tile_stride);
      _tile_loadd(6, c_tile_6, c_tile_stride);
      _tile_loadd(7, c_tile_7, c_tile_stride);
    } else {
      _tile_zero(4);
      _tile_zero(5);
      _tile_zero(6);
      _tile_zero(7);
    }

    for (int32_t k = 0; k < k_times; ++k) {
      _tile_loadd(0, a_tile_0, a_tile_stride);
      _tile_stream_loadd(2, b_tile_2, b_tile_stride);
      _tile_dpbf16ps(4, 0, 2);
      _tile_stream_loadd(3, b_tile_3, b_tile_stride);
      _tile_dpbf16ps(5, 0, 3);
      _tile_loadd(1, a_tile_1, a_tile_stride);
      _tile_dpbf16ps(6, 1, 2);
      _tile_dpbf16ps(7, 1, 3);

      // update ptrs
      if constexpr (phase == AttentionGemmPhase::QK) {
        // Q buffer is prepacked
        a_tile_0 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
        a_tile_1 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
      } else if constexpr (phase == AttentionGemmPhase::PV) {
        // P buffer is not prepacked
        a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
        a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
      } else {
        TORCH_CHECK(false, "Unreachable");
      }
      b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
      b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
    }

    _tile_stored(4, c_tile_4, c_tile_stride);
    _tile_stored(5, c_tile_5, c_tile_stride);
    _tile_stored(6, c_tile_6, c_tile_stride);
    _tile_stored(7, c_tile_7, c_tile_stride);
  }

  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
    const int32_t m_0 = AMX_TILE_ROW_NUM;
    const int32_t m_1 = m - AMX_TILE_ROW_NUM;
    config.rows[0] = m_0;
    config.rows[1] = m_1;
    config.rows[2] = AMX_TILE_ROW_NUM;
    config.rows[3] = AMX_TILE_ROW_NUM;
    config.rows[4] = m_0;
    config.rows[5] = m_0;
    config.rows[6] = m_1;
    config.rows[7] = m_1;
    _tile_loadconfig(&config);
  }
};
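// Scalar model of one _tile_dpbf16ps(C, A, B) step used above (a sketch of
// the intrinsic's effect): every fp32 element C[m][n] accumulates a dot
// product over bf16 pairs,
//   C[m][n] += A[m][2k] * B[k][2n] + A[m][2k+1] * B[k][2n+1],  k = 0..15,
// so the 2-2-4 pattern produces an (up to 32) x 32 fp32 block of C from two
// A tiles and two B tiles per K step.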

// 1-2-2 pattern, for 0 < m <= 16
// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should
// be m, m
// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row num
// should be 16
// TILE 6, 7, (6, 7): store results C matrix, row num should be m
template <typename kv_cache_t>
class TileGemm122 {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
                                void* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122");
  }

  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
    TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122");
  }
};

template <>
class TileGemm122<c10::BFloat16> {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size,
                                c10::BFloat16* __restrict__ a_tile,
                                c10::BFloat16* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    c10::BFloat16* __restrict__ a_tile_0 = a_tile;
    c10::BFloat16* __restrict__ a_tile_1 = [&]() {
      if constexpr (phase == AttentionGemmPhase::QK) {
        // q_buffer is prepacked
        return a_tile + AMX_TILE_BYTES / sizeof(c10::BFloat16);
      } else if constexpr (phase == AttentionGemmPhase::PV) {
        // logits_buffer is row-major
        return a_tile + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
      } else {
        TORCH_CHECK(false, "Unreachable");
      }
    }();
    const int64_t a_tile_stride = [&]() {
      if constexpr (phase == AttentionGemmPhase::QK) {
        // q_buffer is prepacked
        return AMX_TILE_ROW_BYTES;
      } else if constexpr (phase == AttentionGemmPhase::PV) {
        // logits_buffer is row-major
        return lda * sizeof(c10::BFloat16);
      } else {
        TORCH_CHECK(false, "Unreachable");
      }
    }();

    c10::BFloat16* __restrict__ b_tile_2 = b_tile;
    c10::BFloat16* __restrict__ b_tile_3 = [&]() {
      if constexpr (phase == AttentionGemmPhase::QK) {
        // k_cache is prepacked
        return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4);
      } else if constexpr (phase == AttentionGemmPhase::PV) {
        // v_cache is prepacked
        return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4);
      } else {
        TORCH_CHECK(false, "Unreachable");
      }
    }();
    c10::BFloat16* __restrict__ b_tile_4 =
        b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
    c10::BFloat16* __restrict__ b_tile_5 =
        b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
    int64_t b_stride = AMX_TILE_ROW_BYTES;

    float* __restrict__ c_tile_6 = c_tile;
    float* __restrict__ c_tile_7 = c_tile + AMX_TILE_ROW_BYTES / sizeof(float);
    int64_t c_stride = ldc * sizeof(float);

    const int32_t k_times =
        dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
    const int32_t k_group_times = k_times / 2;
    const bool has_tail = (k_times % 2 == 1);

    if (accum_c) {
      _tile_loadd(6, c_tile_6, c_stride);
      _tile_loadd(7, c_tile_7, c_stride);
    } else {
      _tile_zero(6);
      _tile_zero(7);
    }

    for (int32_t k = 0; k < k_group_times; ++k) {
      _tile_loadd(0, a_tile_0, a_tile_stride);
      _tile_stream_loadd(2, b_tile_2, b_stride);
      _tile_dpbf16ps(6, 0, 2);
      _tile_stream_loadd(3, b_tile_3, b_stride);
      _tile_dpbf16ps(7, 0, 3);
      _tile_loadd(1, a_tile_1, a_tile_stride);
      _tile_stream_loadd(4, b_tile_4, b_stride);
      _tile_dpbf16ps(6, 1, 4);
      _tile_stream_loadd(5, b_tile_5, b_stride);
      _tile_dpbf16ps(7, 1, 5);

      // update ptrs
      if constexpr (phase == AttentionGemmPhase::QK) {
        // Q buffer is prepacked
        a_tile_0 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
        a_tile_1 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
      } else if constexpr (phase == AttentionGemmPhase::PV) {
        // P buffer is not prepacked
        a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
        a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
      }
      b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
      b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
      b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
      b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
    }

    if (has_tail) {
      _tile_loadd(0, a_tile_0, a_tile_stride);
      _tile_stream_loadd(2, b_tile_2, b_stride);
      _tile_dpbf16ps(6, 0, 2);
      _tile_stream_loadd(3, b_tile_3, b_stride);
      _tile_dpbf16ps(7, 0, 3);
    }

    _tile_stored(6, c_tile_6, c_stride);
    _tile_stored(7, c_tile_7, c_stride);
  }

  FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
    config.rows[0] = m;
    config.rows[1] = m;
    config.rows[2] = AMX_TILE_ROW_NUM;
    config.rows[3] = AMX_TILE_ROW_NUM;
    config.rows[4] = AMX_TILE_ROW_NUM;
    config.rows[5] = AMX_TILE_ROW_NUM;
    config.rows[6] = m;
    config.rows[7] = m;
    _tile_loadconfig(&config);
  }
};
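// The 1-2-2 loop above unrolls K by two: tiles 2/3 and 4/5 hold consecutive
// B-tile pairs so the stream loads of one pair overlap with the dpbf16ps
// issued on the other, and an odd k_times leaves a single tail step that is
// handled after the loop.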
|
||||
} // namespace
|
||||
|
||||
template <typename scalar_t, int64_t head_dim>
|
||||
class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
|
||||
public:
|
||||
using query_t = scalar_t;
|
||||
using q_buffer_t = scalar_t;
|
||||
using kv_cache_t = scalar_t;
|
||||
using logits_buffer_t = float;
|
||||
using partial_output_buffer_t = float;
|
||||
using prob_buffer_t = scalar_t;
|
||||
|
||||
constexpr static int64_t BlockSizeAlignment =
|
||||
AMX_TILE_ROW_BYTES /
|
||||
sizeof(kv_cache_t); // KV token num unit of QK and PV phases
|
||||
constexpr static int64_t HeadDimAlignment =
|
||||
2 * (AMX_TILE_ROW_BYTES / 4); // headdim num unit of PV phase
|
||||
constexpr static int64_t MaxQHeadNumPerIteration = 32;
|
||||
constexpr static int64_t HeadDim = head_dim;
|
||||
constexpr static ISA ISAType = ISA::AMX;
|
||||
constexpr static bool scale_on_logits = true;
|
||||
|
||||
public:
|
||||
AttentionImpl() : current_q_head_num_(0) {
|
||||
// Use all columns in AMX tiles
|
||||
vec_op::unroll_loop<int, 8>([&](int i) { amx_tile_config_.colsb[i] = 64; });
|
||||
}
|
||||
|
||||
~AttentionImpl() { _tile_release(); }
|
||||
|
||||
template <template <typename tile_gemm_t> typename attention>
|
||||
FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
|
||||
if (q_head_num > AMX_TILE_ROW_NUM) {
|
||||
if (q_head_num != current_q_head_num_) {
|
||||
current_q_head_num_ = q_head_num;
|
||||
TileGemm224<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
|
||||
}
|
||||
attention<TileGemm224<kv_cache_t>> attention_iteration;
|
||||
attention_iteration(CPU_ATTENTION_PARAMS);
|
||||
} else {
|
||||
if (q_head_num != current_q_head_num_) {
|
||||
current_q_head_num_ = q_head_num;
|
||||
TileGemm122<kv_cache_t>::init_tile_config(q_head_num, amx_tile_config_);
|
||||
}
|
||||
attention<TileGemm122<kv_cache_t>> attention_iteration;
|
||||
attention_iteration(CPU_ATTENTION_PARAMS);
|
||||
}
|
||||
}
|
||||
|
||||
// k_cache_token_group_stride: stride of K cache when move to next
|
||||
// BlockSizeAlignment tokens in a block
|
||||
constexpr static int64_t k_cache_token_group_stride(
|
||||
const int32_t block_size) {
|
||||
return BlockSizeAlignment * head_dim;
|
||||
}
|
||||
|
||||
// v_cache_token_group_stride: stride of V cache when move to next
|
||||
// BlockSizeAlignment tokens in a block
|
||||
constexpr static int64_t v_cache_token_group_stride(
|
||||
const int32_t block_size) {
|
||||
return BlockSizeAlignment * (AMX_TILE_ROW_BYTES / 4);
|
||||
}
|
||||
|
||||
// v_cache_head_group_stride: stride of V cache when move to next
|
||||
// HeadDimAlignment head dims in a block
|
||||
constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
|
||||
return block_size * HeadDimAlignment;
|
||||
}
|
||||
|
||||
static void copy_q_heads_tile(
|
||||
scalar_t* __restrict__ src, // [q_num, q_heads_per_kv, head_size]
|
||||
scalar_t* __restrict__ q_buffer, const int32_t q_num,
|
||||
const int32_t q_heads_per_kv, const int64_t q_num_stride,
|
||||
const int64_t q_head_stride, const float scale) {
|
||||
constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
|
||||
static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
|
||||
constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
|
||||
constexpr int64_t head_elem_num_pre_block =
|
||||
AMX_TILE_ROW_BYTES / sizeof(scalar_t);
|
||||
|
||||
int32_t idx = 0;
|
||||
int8_t* __restrict__ q_buffer_iter = reinterpret_cast<int8_t*>(q_buffer);
|
||||
for (int32_t q_num_idx = 0; q_num_idx < q_num;
|
||||
++q_num_idx, src += q_num_stride) {
|
||||
scalar_t* __restrict__ src_iter = src;
|
||||
for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv;
|
||||
++q_head_idx, src_iter += q_head_stride) {
|
||||
vec_op::unroll_loop<int32_t, head_size_block_num>(
|
||||
[&](int32_t head_size_block_idx) {
|
||||
// Use INT8Vec64 for 64 bytes block
|
||||
vec_op::INT8Vec64 vec(src_iter + head_size_block_idx *
|
||||
head_elem_num_pre_block);
|
||||
vec.save(q_buffer_iter + head_size_block_idx * AMX_TILE_BYTES);
|
||||
});
|
||||
|
||||
++idx;
|
||||
q_buffer_iter += AMX_TILE_ROW_BYTES;
|
||||
if ((idx & (AMX_TILE_ROW_NUM - 1)) == 0) {
|
||||
// head is in another amx tile
|
||||
q_buffer_iter -= AMX_TILE_ROW_NUM * AMX_TILE_ROW_BYTES;
|
||||
q_buffer_iter += head_size_block_num * AMX_TILE_BYTES;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
  // reshape KV to an AMX-friendly layout
  static void reshape_and_cache(
      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
      const int64_t head_num, const int64_t key_head_num_stride,
      const int64_t value_head_num_stride, const int64_t num_blocks,
      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
      const int64_t block_size, const int64_t block_size_stride) {
    // For AMX 2D tiles, the size of each line is 64 bytes
    constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
    // For the AMX B matrix, N is always 16
    constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
    constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
    // For now, assume block_size is divisible by amx_b_tile_k_size
    TORCH_CHECK_EQ(block_size % amx_b_tile_k_size, 0);

#pragma omp parallel for collapse(2)
    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
        const int64_t pos = slot_mapping[token_idx];
        if (pos < 0) {
          // skip
          continue;
        }

        const int64_t block_idx = pos / block_size;
        const int64_t block_offset = pos % block_size;
        {
          // Write Key
          // Head elements are packed as quad-words and stored in token
          // groups of (amx_tile_row_size / 4) tokens
          constexpr int64_t token_num_per_group = amx_tile_row_size / 4;
          static_assert(head_dim % (4 / sizeof(scalar_t)) == 0);
          constexpr int64_t quadword_num = head_dim / (4 / sizeof(scalar_t));
          const int32_t* key_start_quadword_ptr =
              reinterpret_cast<const int32_t*>(
                  key + token_idx * key_token_num_stride +
                  head_idx * key_head_num_stride);
          const int64_t group_idx = block_offset / token_num_per_group;
          const int64_t group_offset = block_offset % token_num_per_group;
          constexpr int64_t quadword_num_per_group =
              token_num_per_group * quadword_num;
          int32_t* key_cache_start_ptr =
              reinterpret_cast<int32_t*>(key_cache +
                                         block_idx * num_blocks_stride +
                                         head_idx * cache_head_num_stride) +
              group_idx * quadword_num_per_group + group_offset;

#pragma GCC unroll 8
          for (int64_t i = 0, j = 0; j < quadword_num;
               i += token_num_per_group, ++j) {
            key_cache_start_ptr[i] = key_start_quadword_ptr[j];
          }
        }
        {
          // Write Value
          // Different from Key, the block_size dimension is packed rather
          // than the head_size dimension; the block_size dimension is packed
          // as quad-words
          constexpr int64_t token_num_per_sub_group = 4 / sizeof(scalar_t);
          const int64_t token_num_per_group = block_size;
          constexpr int64_t head_elems_per_group = amx_b_tile_n_size;
          const int64_t group_size = token_num_per_group * head_elems_per_group;
          // For now, assume head_dim is divisible by amx_b_tile_n_size
          static_assert(head_dim % head_elems_per_group == 0);
          constexpr int64_t group_num = head_dim / head_elems_per_group;
          const int64_t sub_group_idx = block_offset / token_num_per_sub_group;
          const int64_t sub_group_offset =
              block_offset % token_num_per_sub_group;

          const scalar_t* value_start_ptr =
              value + token_idx * value_token_num_stride +
              head_idx * value_head_num_stride;
          scalar_t* value_cache_start_ptr =
              value_cache + block_idx * num_blocks_stride +
              head_idx * cache_head_num_stride +
              sub_group_idx * token_num_per_sub_group * amx_b_tile_n_size +
              sub_group_offset;

          for (int64_t i = 0; i < group_num; ++i) {
#pragma GCC unroll head_elems_per_group
            for (int64_t j = 0, k = 0; j < head_elems_per_group;
                 ++j, k += token_num_per_sub_group) {
              value_cache_start_ptr[k] = value_start_ptr[j];
            }
            value_start_ptr += head_elems_per_group;
            value_cache_start_ptr += group_size;
          }
        }
      }
    }
  }

 private:
  alignas(64) __tilecfg amx_tile_config_;
  int32_t current_q_head_num_;
};
}  // namespace cpu_attention

#endif
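The key-cache write above packs each head's elements as 32-bit quad-words and interleaves them so that 16 consecutive tokens (amx_tile_row_size / 4) share each 64-byte AMX tile row. A minimal scalar sketch of the resulting offset arithmetic, assuming a 2-byte scalar_t such as bf16/fp16; amx_k_cache_qword_offset is a hypothetical helper for illustration only, not part of this diff:

#include <cstdint>

// Offset (in 32-bit quad-words) of quad-word `qword_idx` of the token at
// `block_offset`, within one head of one block, under the AMX K-cache
// layout produced by reshape_and_cache above.
int64_t amx_k_cache_qword_offset(int64_t block_offset, int64_t qword_idx,
                                 int64_t head_dim) {
  const int64_t tokens_per_group = 64 / 4;       // AMX_TILE_ROW_BYTES / 4
  const int64_t qwords_per_head = head_dim / 2;  // 2 bf16/fp16 elems per qword
  const int64_t group_idx = block_offset / tokens_per_group;
  const int64_t group_offset = block_offset % tokens_per_group;
  // Within a group, a token's quad-words are strided by tokens_per_group,
  // matching key_cache_start_ptr[i] with i = j * token_num_per_group above.
  return group_idx * tokens_per_group * qwords_per_head +
         qword_idx * tokens_per_group + group_offset;
}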
2013  csrc/cpu/cpu_attn_impl.hpp  (new file)
File diff suppressed because it is too large

63  csrc/cpu/cpu_attn_macros.h  (new file)
@@ -0,0 +1,63 @@
#ifndef CPU_ATTN_MACROS_H
#define CPU_ATTN_MACROS_H

// x86_64
#ifdef __x86_64__
#define FAST_SPINNING _mm_pause();

#ifdef __AVX512F__
#define DEFINE_FAST_EXP \
  const __m512 vec_factorial_1 = _mm512_set1_ps(0.999999701f); \
  const __m512 vec_factorial_2 = _mm512_set1_ps(0.499991506f); \
  const __m512 vec_factorial_3 = _mm512_set1_ps(0.166676521f); \
  const __m512 vec_factorial_4 = _mm512_set1_ps(0.0418978221f); \
  const __m512 vec_factorial_5 = _mm512_set1_ps(0.00828929059f); \
  const __m512 vec_exp_log2ef = \
      _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); \
  const __m512 vec_half = _mm512_set1_ps(0.5f); \
  const __m512 vec_one = _mm512_set1_ps(1.f); \
  const __m512 vec_zero = _mm512_set1_ps(0.f); \
  const __m512 vec_two = _mm512_set1_ps(2.f); \
  const __m512 vec_ln2f = \
      _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); \
  const __m512 vec_ln_flt_min = \
      _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); \
  const __m512 vec_ln_flt_max = \
      _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); \
  const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); \
  const int n_mantissa_bits = 23; \
  auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__(( \
                      always_inline)) { \
    __m512 values = vec.reg; \
    auto less_ln_flt_min_mask = \
        _mm512_cmp_ps_mask(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/); \
    auto vec_src = _mm512_min_ps(values, vec_ln_flt_max); \
    vec_src = _mm512_max_ps(vec_src, vec_ln_flt_min); \
    auto vec_fx = _mm512_fmadd_ps(vec_src, vec_exp_log2ef, vec_half); \
    auto vec_fx_i = _mm512_cvt_roundps_epi32( \
        vec_fx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); \
    vec_fx = _mm512_cvtepi32_ps(vec_fx_i); \
    auto vec_exp_poly = _mm512_fnmadd_ps(vec_fx, vec_ln2f, vec_src); \
    auto vec_res = \
        _mm512_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4); \
    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3); \
    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2); \
    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1); \
    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_one); \
    auto vec_exp_number = _mm512_sub_ps(vec_fx, vec_one); \
    auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); \
    auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); \
    vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); \
    auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); \
    vec_two_pow_n = _mm512_mask_blend_ps(less_ln_flt_min_mask, \
                                         vec_two_pow_n, vec_zero); \
    vec_res = _mm512_mul_ps(vec_res, vec_two_pow_n); \
    vec_res = _mm512_mul_ps(vec_res, vec_two); \
    vec_op::FP32Vec16 res(vec_res); \
    return res; \
  };
#endif

#endif

#endif
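DEFINE_FAST_EXP evaluates exp(x) by range reduction: n = round(x·log2(e)), r = x − n·ln(2), a degree-5 polynomial approximating e^r, and reconstruction as p(r)·2^(n−1)·2, where 2^(n−1) is built directly from raw exponent bits (the n−1 / ×2 split keeps the bit trick in range). A scalar sketch of the same scheme; fast_exp_ref is a hypothetical reference with the constants copied from the macro above:

#include <cmath>
#include <cstdint>
#include <cstring>

float fast_exp_ref(float x) {
  // Same clamp range as the vector code: [ln(FLT_MIN), ln(FLT_MAX)].
  const float ln_flt_min = -87.336544f;  // bits 0xc2aeac50
  const float ln_flt_max = 88.722839f;   // bits 0x42b17218
  const bool underflow = x < ln_flt_min;
  x = std::fmin(std::fmax(x, ln_flt_min), ln_flt_max);

  const float log2e = 1.442695041f;
  const float ln2 = 0.693147181f;
  const float n = std::floor(x * log2e + 0.5f);  // round to nearest
  const float r = x - n * ln2;                   // |r| <= ln(2) / 2

  // Degree-5 polynomial for e^r; coefficients approximate 1/k!.
  float p = 0.00828929059f;
  p = p * r + 0.0418978221f;
  p = p * r + 0.166676521f;
  p = p * r + 0.499991506f;
  p = p * r + 0.999999701f;
  p = p * r + 1.0f;

  // Build 2^(n-1) from raw exponent bits, then multiply by 2 at the end,
  // mirroring the vector code (including flushing underflow to zero).
  const int32_t e = static_cast<int32_t>(n) - 1 + 127;
  const uint32_t bits = static_cast<uint32_t>(e) << 23;
  float two_pow_n_minus_1;
  std::memcpy(&two_pow_n_minus_1, &bits, sizeof(bits));
  if (underflow) two_pow_n_minus_1 = 0.0f;
  return p * two_pow_n_minus_1 * 2.0f;
}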
386  csrc/cpu/cpu_attn_neon.hpp  (new file)
@@ -0,0 +1,386 @@
#ifndef CPU_ATTN_NEON_HPP
#define CPU_ATTN_NEON_HPP

#include "cpu_attn_impl.hpp"
#include <arm_neon.h>
#include <type_traits>
namespace cpu_attention {

namespace {

#define BLOCK_SIZE_ALIGNMENT 32
#define HEAD_SIZE_ALIGNMENT 32
#define MAX_Q_HEAD_NUM_PER_ITER 16

// These helpers do not use the vectorized classes for loading / converting
// because csrc/cpu/cpu_types_arm.hpp does not have fallback options
// for vec_op::BF16Vec* on Arm HW that doesn't support BF16.
// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency.
template <typename kv_cache_t>
FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0,
                                     float32x4_t& b1);

template <>
FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, float32x4_t& b0,
                                            float32x4_t& b1) {
  b0 = vld1q_f32(p + 0);
  b1 = vld1q_f32(p + 4);
}

template <>
FORCE_INLINE void load_row8_B_as_f32<c10::Half>(const c10::Half* p,
                                                float32x4_t& b0,
                                                float32x4_t& b1) {
  const float16_t* h = reinterpret_cast<const float16_t*>(p);
  float16x8_t v = vld1q_f16(h);
  b0 = vcvt_f32_f16(vget_low_f16(v));
  b1 = vcvt_f32_f16(vget_high_f16(v));
}

template <>
FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
                                                    float32x4_t& b0,
                                                    float32x4_t& b1) {
  const uint16_t* u = reinterpret_cast<const uint16_t*>(p);
#ifdef ARM_BF16_SUPPORT
  uint16x8_t u0 = vld1q_u16(u);
  bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0);
  b0 = vcvtq_low_f32_bf16(bf0);
  b1 = vcvtq_high_f32_bf16(bf0);
#else
  uint16x8_t x0 = vld1q_u16(u);
  uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16);
  uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16);
  b0 = vreinterpretq_f32_u32(lo);
  b1 = vreinterpretq_f32_u32(hi);
#endif
}

// Mx8, with 1 <= M <= 8, K streamed, unrolled by 4 with NEON FMLAs
// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
// #FMLAs = (K // 4) * (4 * 2 * M)
// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
// (e.g. M = 8 with fp16 B: 64 FMLAs per 12 loads in each unrolled step)
template <int32_t M, typename kv_cache_t>
FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4(
    const float* __restrict A,       // [M x K],
    const kv_cache_t* __restrict B,  // [K x 8],
    float* __restrict C,             // [M x 8],
    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
  // kernel supports max M of 8, as it'd spill for larger M
  static_assert(1 <= M && M <= 8, "M must be in [1,8]");

  // helpers for per-M codegen
#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
#define IF_M(i) if constexpr (M > (i))

  // A row base pointers
#define DECL_A(i) const float* a##i = A + (i) * lda;
  ROWS_APPLY(DECL_A)
#undef DECL_A

  // declare 2 accumulators per row of M
#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1;
  ROWS_APPLY(DECL_ACC)
#undef DECL_ACC

  // initialize accumulators
#define INIT_ACC(i)                              \
  IF_M(i) {                                      \
    if (accumulate) {                            \
      acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \
      acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \
    } else {                                     \
      acc##i##_0 = vdupq_n_f32(0.f);             \
      acc##i##_1 = vdupq_n_f32(0.f);             \
    }                                            \
  }
  ROWS_APPLY(INIT_ACC)
#undef INIT_ACC

  int32_t k = 0;

  // K unrolled by 4
  for (; k + 3 < K; k += 4) {
    // load A[k..k+3] for each active row (M)
#define LOAD_A4(i)     \
  float32x4_t a##i##v; \
  IF_M(i) a##i##v = vld1q_f32(a##i + k);
    ROWS_APPLY(LOAD_A4)
#undef LOAD_A4

    // helper: FMA lane L from aiv
#define FMAS_LANE(i, aiv, L)                              \
  IF_M(i) {                                               \
    acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \
    acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \
  }

    // k + 0
    {
      float32x4_t b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 0) * ldb, b0, b1);
#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
      ROWS_APPLY(STEP_K0)
#undef STEP_K0
    }
    // k + 1
    {
      float32x4_t b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 1) * ldb, b0, b1);
#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
      ROWS_APPLY(STEP_K1)
#undef STEP_K1
    }
    // k + 2
    {
      float32x4_t b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 2) * ldb, b0, b1);
#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
      ROWS_APPLY(STEP_K2)
#undef STEP_K2
    }
    // k + 3
    {
      float32x4_t b0, b1;
      load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 3) * ldb, b0, b1);
#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
      ROWS_APPLY(STEP_K3)
#undef STEP_K3
    }
#undef FMAS_LANE
  }

  // K tail
  for (; k < K; ++k) {
    float32x4_t b0, b1;
    load_row8_B_as_f32<kv_cache_t>(B + (int64_t)k * ldb, b0, b1);
#define TAIL_ROW(i)                             \
  IF_M(i) {                                     \
    float32x4_t ai = vdupq_n_f32(*(a##i + k));  \
    acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \
    acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \
  }
    ROWS_APPLY(TAIL_ROW)
#undef TAIL_ROW
  }

  // store accumulators to C
#define STORE_ROW(i)                          \
  IF_M(i) {                                   \
    vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \
    vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \
  }
  ROWS_APPLY(STORE_ROW)
#undef STORE_ROW

#undef ROWS_APPLY
#undef IF_M
}

template <int32_t N, typename kv_cache_t>
FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A,
                                               const kv_cache_t* __restrict B,
                                               float* __restrict C, int32_t M,
                                               int32_t K, int64_t lda,
                                               int64_t ldb, int64_t ldc,
                                               bool accumulate) {
  // micro kernel is Mx8
  static_assert(N % 8 == 0, "N must be a multiple of 8");
  for (int32_t m = 0; m < M;) {
    int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
    const float* Ab = A + m * lda;
    float* Cb = C + m * ldc;

    for (int32_t n = 0; n < N; n += 8) {
      const kv_cache_t* Bn = B + n;
      float* Cn = Cb + n;
      switch (mb) {
        case 8:
          gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
                                                      ldc, K, accumulate);
          break;
        case 4:
          gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
                                                      ldc, K, accumulate);
          break;
        case 2:
          gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
                                                      ldc, K, accumulate);
          break;
        default:
          gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
                                                      ldc, K, accumulate);
          break;
      }
    }
    // no tail loop for N as it's guaranteed to be a multiple of 8
    m += mb;
  }
}

template <typename kv_cache_t>
class TileGemmNeonFMLA {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size,
                                float* __restrict__ a_tile,
                                kv_cache_t* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    if constexpr (phase == AttentionGemmPhase::QK) {
      gemm_macro_neon_fmla_Mx8_Ku4<BLOCK_SIZE_ALIGNMENT, kv_cache_t>(
          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
    } else {
      gemm_macro_neon_fmla_Mx8_Ku4<HEAD_SIZE_ALIGNMENT, kv_cache_t>(
          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
          accum_c);
    }
  }
};

}  // namespace

// this is similar to "ISA::VEC" at the moment
template <typename scalar_t, int64_t head_dim>
class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
 public:
  using query_t = scalar_t;
  using q_buffer_t = float;
  using kv_cache_t = scalar_t;
  using logits_buffer_t = float;
  using partial_output_buffer_t = float;
  using prob_buffer_t = float;

  constexpr static int64_t BlockSizeAlignment =
      BLOCK_SIZE_ALIGNMENT;  // KV token num unit of QK and PV phases
  constexpr static int64_t HeadDimAlignment =
      HEAD_SIZE_ALIGNMENT;  // headdim num unit of PV phase
  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
  constexpr static int64_t HeadDim = head_dim;
  constexpr static ISA ISAType = ISA::NEON;
  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer

  static_assert(HeadDim % HeadDimAlignment == 0);
  // the gemm micro kernel is Mx8
  static_assert(HeadDimAlignment % 8 == 0);
  static_assert(BlockSizeAlignment % 8 == 0);

 public:
  template <template <typename tile_gemm_t> typename attention>
  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
    attention<TileGemmNeonFMLA<kv_cache_t>> attention_iteration;
    attention_iteration(CPU_ATTENTION_PARAMS);
  }

  // k_cache_token_group_stride: stride of the K cache when moving to the next
  // BlockSizeAlignment tokens in a block
  constexpr static int64_t k_cache_token_group_stride(
      const int32_t block_size) {
    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
                                // block_size], row-major
  }

  // v_cache_token_group_stride: stride of the V cache when moving to the next
  // BlockSizeAlignment tokens in a block
  constexpr static int64_t v_cache_token_group_stride(
      const int32_t block_size) {
    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
                                           // head_dim], row-major
  }

  // v_cache_head_group_stride: stride of the V cache when moving to the next
  // HeadDimAlignment head dims in a block
  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
                              // row-major
  }

  // Copy q to q_buffer and cast it to fp32
  static void copy_q_heads_tile(
      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
      float* __restrict__ q_buffer, const int32_t q_num,
      const int32_t q_heads_per_kv, const int64_t q_num_stride,
      const int64_t q_head_stride, float scale) {
    static_assert(head_dim % 16 == 0);
    constexpr int32_t unroll_size = head_dim / 16;
    using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;

    vec_op::FP32Vec16 scale_vec(scale);
    for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
        scalar_t* __restrict__ curr_q =
            src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
        float* __restrict__ curr_q_buffer =
            q_buffer + q_num_idx * q_heads_per_kv * head_dim +
            q_head_idx * head_dim;

        vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
          load_vec_t vec(curr_q);
          vec_op::FP32Vec16 fp32_vec(vec);
          fp32_vec = fp32_vec * scale_vec;
          fp32_vec.save(curr_q_buffer);

          curr_q += 16;
          curr_q_buffer += 16;
        });
      }
    }
  }

  // reshape K as column-major and V as row-major
  static void reshape_and_cache(
      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
      const int64_t head_num, const int64_t key_head_num_stride,
      const int64_t value_head_num_stride, const int64_t num_blocks,
      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
      const int64_t block_size, const int64_t block_size_stride) {
#pragma omp parallel for collapse(2)
    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
        const int64_t pos = slot_mapping[token_idx];
        if (pos < 0) {
          // skip
          continue;
        }

        const int64_t block_idx = pos / block_size;
        const int64_t block_offset = pos % block_size;
        {
          // Write Key
          const scalar_t* key_start_ptr = key +
                                          token_idx * key_token_num_stride +
                                          head_idx * key_head_num_stride;
          scalar_t* key_cache_start_ptr =
              key_cache + block_idx * num_blocks_stride +
              head_idx * cache_head_num_stride + block_offset;

#pragma GCC unroll 8
          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
            key_cache_start_ptr[j] = key_start_ptr[i];
          }
        }
        {
          // Write Value
          const scalar_t* value_start_ptr =
              value + token_idx * value_token_num_stride +
              head_idx * value_head_num_stride;
          scalar_t* value_cache_start_ptr =
              value_cache + block_idx * num_blocks_stride +
              head_idx * cache_head_num_stride + block_offset * head_dim;
          std::memcpy(value_cache_start_ptr, value_start_ptr,
                      sizeof(scalar_t) * head_dim);
        }
      }
    }
  }
};
}  // namespace cpu_attention

#endif  // #ifndef CPU_ATTN_NEON_HPP
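gemm_micro_neon_fmla_Mx8_Ku4 above computes C[M,8] (+)= A[M,K] · B[K,8], loading each B row as fp32 via load_row8_B_as_f32. A scalar reference with the same contract can be used to spot-check the intrinsics path; this is a hypothetical test helper, not part of the diff:

#include <cstdint>

template <typename kv_cache_t>
void gemm_micro_ref_Mx8(const float* A, const kv_cache_t* B, float* C,
                        int64_t lda, int64_t ldb, int64_t ldc, int32_t M,
                        int32_t K, bool accumulate) {
  for (int32_t m = 0; m < M; ++m) {
    for (int32_t n = 0; n < 8; ++n) {
      float acc = accumulate ? C[m * ldc + n] : 0.0f;
      for (int32_t k = 0; k < K; ++k) {
        // B elements are widened to fp32, as load_row8_B_as_f32 does.
        acc += A[m * lda + k] * static_cast<float>(B[k * ldb + n]);
      }
      C[m * ldc + n] = acc;
    }
  }
}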
248  csrc/cpu/cpu_attn_vec.hpp  (new file)
@@ -0,0 +1,248 @@
#ifndef CPU_ATTN_VEC_HPP
#define CPU_ATTN_VEC_HPP

#include "cpu_attn_impl.hpp"

namespace cpu_attention {

namespace {
// 8-2-16 pattern: 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [K, 32]
template <typename kv_cache_t>
class TileGemm82 {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size,
                                float* __restrict__ a_tile,
                                kv_cache_t* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    switch (m_size) {
      case 1:
        gemm_micro<1>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 2:
        gemm_micro<2>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 3:
      case 4:
        gemm_micro<4>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 5:
      case 6:
        gemm_micro<6>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 7:
      case 8:
        gemm_micro<8>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
    }
  }

  template <int32_t M>
  static void gemm_micro(float* __restrict__ a_tile,
                         kv_cache_t* __restrict__ b_tile,
                         float* __restrict__ c_tile, const int64_t lda,
                         const int64_t ldb, const int64_t ldc,
                         const int32_t block_size, const int32_t dynamic_k_size,
                         const bool accum_c) {
    static_assert(0 < M && M <= 8);
    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;

    kv_cache_t* __restrict__ curr_b_0 = b_tile;
    kv_cache_t* __restrict__ curr_b_1 = b_tile + 16;
    float* __restrict__ curr_c_0 = c_tile;
    float* __restrict__ curr_c_1 = c_tile + 16;

    vec_op::FP32Vec16 c_regs[M * 2];
    if (accum_c) {
      float* __restrict__ curr_m_c_0 = curr_c_0;
      float* __restrict__ curr_m_c_1 = curr_c_1;
      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
        c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0);
        c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1);

        // update
        curr_m_c_0 += ldc;
        curr_m_c_1 += ldc;
      });
    }

    float* __restrict__ curr_a = a_tile;
    for (int32_t k = 0; k < dynamic_k_size; ++k) {
      load_vec_t b_0_reg(curr_b_0);
      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
      load_vec_t b_1_reg(curr_b_1);
      vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);

      float* __restrict__ curr_m_a = curr_a;
      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
        float v = *curr_m_a;
        vec_op::FP32Vec16 a_reg(v);
        c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
        c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;

        // update
        curr_m_a += lda;
      });

      // update
      curr_a += 1;
      curr_b_0 += ldb;
      curr_b_1 += ldb;
    }

    vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
      c_regs[i * 2].save(curr_c_0);
      c_regs[i * 2 + 1].save(curr_c_1);

      // update
      curr_c_0 += ldc;
      curr_c_1 += ldc;
    });
  }
};
}  // namespace

// This is a general but naive implementation based on vector instructions
template <typename scalar_t, int64_t head_dim>
class AttentionImpl<ISA::VEC, scalar_t, head_dim> {
 public:
  using query_t = scalar_t;
  using q_buffer_t = float;
  using kv_cache_t = scalar_t;
  using logits_buffer_t = float;
  using partial_output_buffer_t = float;
  using prob_buffer_t = float;

  constexpr static int64_t BlockSizeAlignment =
      32;  // KV token num unit of QK and PV phases
  constexpr static int64_t HeadDimAlignment =
      32;  // headdim num unit of PV phase
  constexpr static int64_t MaxQHeadNumPerIteration = 8;
  constexpr static int64_t HeadDim = head_dim;
  constexpr static ISA ISAType = ISA::VEC;
  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer

 public:
  template <template <typename tile_gemm_t> typename attention>
  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
    attention<TileGemm82<kv_cache_t>> attention_iteration;
    attention_iteration(CPU_ATTENTION_PARAMS);
  }

  // k_cache_token_group_stride: stride of the K cache when moving to the next
  // BlockSizeAlignment tokens in a block
  constexpr static int64_t k_cache_token_group_stride(
      const int32_t block_size) {
    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
                                // block_size], row-major
  }

  // v_cache_token_group_stride: stride of the V cache when moving to the next
  // BlockSizeAlignment tokens in a block
  constexpr static int64_t v_cache_token_group_stride(
      const int32_t block_size) {
    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
                                           // head_dim], row-major
  }

  // v_cache_head_group_stride: stride of the V cache when moving to the next
  // HeadDimAlignment head dims in a block
  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
                              // row-major
  }

  // Copy q to q_buffer and cast it to fp32
  static void copy_q_heads_tile(
      scalar_t* __restrict__ src,  // [q_num, q_heads_per_kv, head_size]
      float* __restrict__ q_buffer, const int32_t q_num,
      const int32_t q_heads_per_kv, const int64_t q_num_stride,
      const int64_t q_head_stride, float scale) {
    static_assert(head_dim % 16 == 0);
    constexpr int32_t unroll_size = head_dim / 16;
    using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;

    vec_op::FP32Vec16 scale_vec(scale);
    for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
      for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
        scalar_t* __restrict__ curr_q =
            src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
        float* __restrict__ curr_q_buffer =
            q_buffer + q_num_idx * q_heads_per_kv * head_dim +
            q_head_idx * head_dim;

        vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
          load_vec_t vec(curr_q);
          vec_op::FP32Vec16 fp32_vec(vec);
          fp32_vec = fp32_vec * scale_vec;
          fp32_vec.save(curr_q_buffer);

          curr_q += 16;
          curr_q_buffer += 16;
        });
      }
    }
  }

  // reshape K as column-major and V as row-major
  static void reshape_and_cache(
      const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
      scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
      const int64_t head_num, const int64_t key_head_num_stride,
      const int64_t value_head_num_stride, const int64_t num_blocks,
      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
      const int64_t block_size, const int64_t block_size_stride) {
#pragma omp parallel for collapse(2)
    for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
      for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
        const int64_t pos = slot_mapping[token_idx];
        if (pos < 0) {
          // skip
          continue;
        }

        const int64_t block_idx = pos / block_size;
        const int64_t block_offset = pos % block_size;
        {
          // Write Key as column-major
          const scalar_t* key_start_ptr = key +
                                          token_idx * key_token_num_stride +
                                          head_idx * key_head_num_stride;
          scalar_t* key_cache_start_ptr =
              key_cache + block_idx * num_blocks_stride +
              head_idx * cache_head_num_stride + block_offset;

#pragma GCC unroll 8
          for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
            key_cache_start_ptr[j] = key_start_ptr[i];
          }
        }
        {
          // Write Value as row-major
          const scalar_t* value_start_ptr =
              value + token_idx * value_token_num_stride +
              head_idx * value_head_num_stride;
          scalar_t* value_cache_start_ptr =
              value_cache + block_idx * num_blocks_stride +
              head_idx * cache_head_num_stride + block_offset * head_dim;
          std::memcpy(value_cache_start_ptr, value_start_ptr,
                      sizeof(scalar_t) * head_dim);
        }
      }
    }
  }
};
}  // namespace cpu_attention

#endif
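TileGemm82::gemm_micro computes C[M,32] (+)= A[M,K] · B[K,32], where each B row holds two 16-wide panels at offsets 0 and 16 and consecutive rows are ldb apart, so B[k][n] sits at b_tile[k * ldb + n]. A scalar reference under the same assumptions (hypothetical, illustration only):

#include <cstdint>

template <typename kv_cache_t>
void tile_gemm82_ref(const float* a_tile, const kv_cache_t* b_tile,
                     float* c_tile, int64_t lda, int64_t ldb, int64_t ldc,
                     int32_t M, int32_t K, bool accum_c) {
  for (int32_t m = 0; m < M; ++m) {
    for (int32_t n = 0; n < 32; ++n) {  // two 16-wide panels
      float acc = accum_c ? c_tile[m * ldc + n] : 0.0f;
      for (int32_t k = 0; k < K; ++k) {
        acc += a_tile[m * lda + k] * static_cast<float>(b_tile[k * ldb + n]);
      }
      c_tile[m * ldc + n] = acc;
    }
  }
}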
171  csrc/cpu/cpu_attn_vec16.hpp  (new file)
@@ -0,0 +1,171 @@
#ifndef CPU_ATTN_VEC16_HPP
#define CPU_ATTN_VEC16_HPP

#include "cpu_attn_vec.hpp"

namespace cpu_attention {

namespace {
// 16-1-16 pattern: 16 regs for A, 1 reg for B, 16 regs for C,
// [16, K] @ [K, 16]
template <typename kv_cache_t>
class TileGemm161 {
 public:
  template <AttentionGemmPhase phase, int32_t k_size>
  FORCE_INLINE static void gemm(const int32_t m_size,
                                float* __restrict__ a_tile,
                                kv_cache_t* __restrict__ b_tile,
                                float* __restrict__ c_tile, const int64_t lda,
                                const int64_t ldb, const int64_t ldc,
                                const int32_t block_size,
                                const int32_t dynamic_k_size,
                                const bool accum_c) {
    switch (m_size) {
      case 1:
        gemm_micro<1>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 2:
        gemm_micro<2>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 3:
      case 4:
        gemm_micro<4>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 5:
      case 6:
        gemm_micro<6>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 7:
      case 8:
        gemm_micro<8>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                      dynamic_k_size, accum_c);
        break;
      case 9:
      case 10:
      case 11:
      case 12:
        gemm_micro<12>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                       dynamic_k_size, accum_c);
        break;
      case 13:
      case 14:
      case 15:
      case 16:
        gemm_micro<16>(a_tile, b_tile, c_tile, lda, ldb, ldc, block_size,
                       dynamic_k_size, accum_c);
        break;
    }
  }

  template <int32_t M>
  static void gemm_micro(float* __restrict__ a_tile,
                         kv_cache_t* __restrict__ b_tile,
                         float* __restrict__ c_tile, const int64_t lda,
                         const int64_t ldb, const int64_t ldc,
                         const int32_t block_size, const int32_t dynamic_k_size,
                         const bool accum_c) {
    static_assert(0 < M && M <= 16);
    using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;

    kv_cache_t* __restrict__ curr_b_0 = b_tile;
    float* __restrict__ curr_c_0 = c_tile;

    vec_op::FP32Vec16 c_regs[M];
    if (accum_c) {
      float* __restrict__ curr_m_c_0 = curr_c_0;
      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
        c_regs[i] = vec_op::FP32Vec16(curr_m_c_0);

        // update
        curr_m_c_0 += ldc;
      });
    }

    float* __restrict__ curr_a = a_tile;
    for (int32_t k = 0; k < dynamic_k_size; ++k) {
      load_vec_t b_0_reg(curr_b_0);
      vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);

      float* __restrict__ curr_m_a = curr_a;
      vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
        float v = *curr_m_a;
        vec_op::FP32Vec16 a_reg(v);
        c_regs[i] = c_regs[i] + a_reg * fp32_b_0_reg;

        // update
        curr_m_a += lda;
      });

      // update
      curr_a += 1;
      curr_b_0 += ldb;
    }

    vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
      c_regs[i].save(curr_c_0);

      // update
      curr_c_0 += ldc;
    });
  }
};
}  // namespace

// This is a general but naive implementation based on vector instructions
template <typename scalar_t, int64_t head_dim>
class AttentionImpl<ISA::VEC16, scalar_t, head_dim>
    : public AttentionImpl<ISA::VEC, scalar_t, head_dim> {
 public:
  using query_t = scalar_t;
  using q_buffer_t = float;
  using kv_cache_t = scalar_t;
  using logits_buffer_t = float;
  using partial_output_buffer_t = float;
  using prob_buffer_t = float;

  constexpr static int64_t BlockSizeAlignment =
      16;  // KV token num unit of QK and PV phases
  constexpr static int64_t HeadDimAlignment =
      16;  // headdim num unit of PV phase
  constexpr static int64_t MaxQHeadNumPerIteration = 16;
  constexpr static int64_t HeadDim = head_dim;
  constexpr static ISA ISAType = ISA::VEC16;
  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer

 public:
  template <template <typename tile_gemm_t> typename attention>
  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
    attention<TileGemm161<kv_cache_t>> attention_iteration;
    attention_iteration(CPU_ATTENTION_PARAMS);
  }

  // k_cache_token_group_stride: stride of the K cache when moving to the next
  // BlockSizeAlignment tokens in a block
  constexpr static int64_t k_cache_token_group_stride(
      const int32_t block_size) {
    return BlockSizeAlignment;  // layout of k_cache block is [head_dim,
                                // block_size], row-major
  }

  // v_cache_token_group_stride: stride of the V cache when moving to the next
  // BlockSizeAlignment tokens in a block
  constexpr static int64_t v_cache_token_group_stride(
      const int32_t block_size) {
    return head_dim * BlockSizeAlignment;  // layout of v_cache is [block_size,
                                           // head_dim], row-major
  }

  // v_cache_head_group_stride: stride of the V cache when moving to the next
  // HeadDimAlignment head dims in a block
  constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
    return HeadDimAlignment;  // layout of v_cache is [block_size, head_dim],
                              // row-major
  }
};
}  // namespace cpu_attention

#endif
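TileGemm161 has the same structure with a single 16-wide B panel, i.e. C[M,16] (+)= A[M,K] · B[K,16] with B[k][n] at b_tile[k * ldb + n]. This is why MaxQHeadNumPerIteration can grow to 16: M accumulators plus one B vector and one broadcast A value fit a 32-register vector file when FP32Vec16 maps to one native register (an assumption about the target; with the scalar fallback types these are just arrays). A sketch mirroring the TileGemm82 reference above (hypothetical helper):

#include <cstdint>

template <typename kv_cache_t>
void tile_gemm161_ref(const float* a_tile, const kv_cache_t* b_tile,
                      float* c_tile, int64_t lda, int64_t ldb, int64_t ldc,
                      int32_t M, int32_t K, bool accum_c) {
  for (int32_t m = 0; m < M; ++m) {
    for (int32_t n = 0; n < 16; ++n) {  // one 16-wide panel
      float acc = accum_c ? c_tile[m * ldc + n] : 0.0f;
      for (int32_t k = 0; k < K; ++k) {
        acc += a_tile[m * lda + k] * static_cast<float>(b_tile[k * ldb + n]);
      }
      c_tile[m * ldc + n] = acc;
    }
  }
}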
@@ -26,10 +26,6 @@ namespace vec_op {

 #define FORCE_INLINE __attribute__((always_inline)) inline

-#define __max(a, b) ((a) > (b) ? (a) : (b))
-#define __min(a, b) ((a) < (b) ? (a) : (b))
-#define __abs(a) ((a) < (0) ? (0 - a) : (a))
-
 typedef struct f16x8_t {
   uint16_t val[8];
 } f16x8_t;
@@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
   void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }

   void save(void* ptr, const int elem_num) const {
-    int num = __min(elem_num, VEC_ELEM_NUM);
+    int num = std::min(elem_num, VEC_ELEM_NUM);
     std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
   }
 };
@@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
   void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }

   void save(void* ptr, const int elem_num) const {
-    int num = __min(elem_num, VEC_ELEM_NUM);
+    int num = std::min(elem_num, VEC_ELEM_NUM);
     std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
   }
 };
@@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
   explicit BF16Vec32(f16x32_t data) : reg(data) {};

   explicit BF16Vec32(BF16Vec8& vec8_data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+    unroll_loop<int, VEC_ELEM_NUM>([&vec8_data, this](int i) {
       reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
-    }
+    });
   }

   void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
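The hunks in this file replace plain for-loops with vec_op::unroll_loop. A minimal sketch of the semantics this refactor assumes — call the functor with indices 0..N-1 known at compile time so the compiler can fully unroll; the real helper lives in vLLM's cpu_types headers and may differ in detail:

#include <utility>

template <typename T, T... Is, typename F>
constexpr void unroll_sketch_impl(F&& f, std::integer_sequence<T, Is...>) {
  (f(Is), ...);  // expands to f(0); f(1); ...; f(N - 1);
}

template <typename T, T N, typename F>
constexpr void unroll_loop_sketch(F&& f) {
  unroll_sketch_impl(std::forward<F>(f), std::make_integer_sequence<T, N>{});
}

// Usage mirroring the BF16Vec32 constructor above:
//   unroll_loop_sketch<int, 32>([&](int i) { /* body */ });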
@@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> {
   f32x4_t reg;

   explicit FP32Vec4(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([&v, this](int i) { reg.val[i] = v; });
   }

   explicit FP32Vec4() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([this](int i) { reg.val[i] = 0.0f; });
   }

   explicit FP32Vec4(const float* ptr)
@@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
   f32x8_t reg;

   explicit FP32Vec8(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([&v, this](int i) { reg.val[i] = v; });
   }

   explicit FP32Vec8() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([this](int i) { reg.val[i] = 0.0f; });
   }

   explicit FP32Vec8(const float* ptr)
@@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
   explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};

   explicit FP32Vec8(const FP16Vec8& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = fp16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
   }

   FP32Vec8(const BF16Vec8& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = bf16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
   }

   float reduce_sum() const {
     float result = 0;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result += reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result += reg.val[i]; });
     return result;
   }

   FP32Vec8 exp() const {
     f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = expf(reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); });
     return FP32Vec8(ret);
   }

   FP32Vec8 tanh() const {
     f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = tanhf(reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); });
     return FP32Vec8(ret);
   }

   FP32Vec8 er() const {
     f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = erf(reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); });
     return FP32Vec8(ret);
   }

   FP32Vec8 operator*(const FP32Vec8& b) const {
     f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] * b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
     return FP32Vec8(ret);
   }

   FP32Vec8 operator+(const FP32Vec8& b) const {
     f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] + b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
     return FP32Vec8(ret);
   }

   FP32Vec8 operator-(const FP32Vec8& b) const {
     f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] - b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
     return FP32Vec8(ret);
   }

   FP32Vec8 operator/(const FP32Vec8& b) const {
     f32x8_t ret;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      ret.val[i] = reg.val[i] / b.reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
     return FP32Vec8(ret);
   }

@@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   f32x16_t reg;

   explicit FP32Vec16(float v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = v;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([&v, this](int i) { reg.val[i] = v; });
   }

   explicit FP32Vec16() {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = 0.0f;
-    }
+    unroll_loop<int, VEC_ELEM_NUM>([this](int i) { reg.val[i] = 0.0f; });
   }

   explicit FP32Vec16(const float* ptr)
@@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   explicit FP32Vec16(f32x16_t data) : reg(data) {};

   FP32Vec16(const FP32Vec4& data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+    unroll_loop<int, VEC_ELEM_NUM>([&data, this](int i) {
       reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
-    }
+    });
   }

   FP32Vec16(const FP32Vec8& data) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+    unroll_loop<int, VEC_ELEM_NUM>([&data, this](int i) {
       reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
-    }
+    });
   }

   FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};

   explicit FP32Vec16(const FP16Vec16& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = fp16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
   }

   explicit FP32Vec16(const BF16Vec16& v) {
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      reg.val[i] = bf16_to_float(v.reg.val[i]);
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
   }

   explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
@@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};

   FP32Vec16 operator*(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] * b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
+    return FP32Vec16(ret);
   }

   FP32Vec16 operator+(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] + b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
+    return FP32Vec16(ret);
   }

   FP32Vec16 operator-(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] - b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
+    return FP32Vec16(ret);
   }

   FP32Vec16 operator/(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = reg.val[i] / b.reg.val[i];
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
+    return FP32Vec16(ret);
   }

   FP32Vec16 max(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>([&ret, &b, this](int i) {
+      ret.val[i] = std::max(reg.val[i], b.reg.val[i]);
+    });
+    return FP32Vec16(ret);
   }

   FP32Vec16 min(const FP32Vec16& b) const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>([&ret, &b, this](int i) {
+      ret.val[i] = std::min(reg.val[i], b.reg.val[i]);
+    });
+    return FP32Vec16(ret);
   }

   FP32Vec16 abs() const {
-    FP32Vec16 result(0.0f);
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result.reg.val[i] = __abs(reg.val[i]);
-    }
-    return result;
+    f32x16_t ret;
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); });
+    return FP32Vec16(ret);
   }

   float reduce_sum() const {
     float result = 0.0f;
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result += reg.val[i];
-    }
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result += reg.val[i]; });
     return result;
   }

   float reduce_max() const {
-    float result = reg.val[0];
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result = __max(reg.val[i], result);
-    }
+    float result = std::numeric_limits<float>::lowest();
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result = std::max(reg.val[i], result); });
     return result;
   }

   float reduce_min() const {
-    float result = reg.val[0];
-    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
-      result = __min(reg.val[i], result);
-    }
+    float result = std::numeric_limits<float>::max();
+    unroll_loop<int, VEC_ELEM_NUM>(
+        [&result, this](int i) { result = std::min(reg.val[i], result); });
     return result;
   }
@@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
     float sum = 0.0;
-    int start = idx * group_size;
-    int end = (idx + 1) * group_size;
-
-    for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
-      sum += reg.val[start];
-    }
-
+    const int start = idx * group_size;
+    unroll_loop<int, group_size>(
+        [&sum, &start, this](int i) { sum += reg.val[start + i]; });
     return sum;
   }
@@ -477,17 +437,13 @@ inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
 }

 inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
-  int i = 0;
-  for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_fp16(v.reg.val[i]);
-  }
+  unroll_loop<int, FP16Vec16::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
 }

 inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
-  int i = 0;
-  for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_fp16(v.reg.val[i]);
-  }
+  unroll_loop<int, FP16Vec8::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
 }

 inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
@@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
 }

 inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
-  int i = 0;
-  for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_bf16(v.reg.val[i]);
-  }
+  unroll_loop<int, BF16Vec8::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
 }

 inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
-  int i = 0;
-  for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
-    reg.val[i] = float_to_bf16(v.reg.val[i]);
-  }
+  unroll_loop<int, BF16Vec16::VEC_ELEM_NUM>(
+      [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
 }

 inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }

@@ -4,6 +4,7 @@

 #include <vecintrin.h>
 #include <cmath>
+#include <limits>
 #include <torch/all.h>
 namespace vec_op {

@@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
   }

   explicit FP32Vec8(const BF16Vec8& v) {
-    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
-    reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
+    // On big-endian s390x, place BF16 first to get correct byte order
+    reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
+    reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
   }

   float reduce_sum() const {
@@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::exp(ar.values[0]);
|
||||
ret.val[0][1] = std::exp(ar.values[1]);
|
||||
ret.val[0][2] = std::exp(ar.values[2]);
|
||||
ret.val[0][3] = std::exp(ar.values[3]);
|
||||
ret.val[1][0] = std::exp(ar.values[4]);
|
||||
ret.val[1][1] = std::exp(ar.values[5]);
|
||||
ret.val[1][2] = std::exp(ar.values[6]);
|
||||
ret.val[1][3] = std::exp(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
f32x4x2_t out;
|
||||
|
||||
const __vector float log2e = vec_splats(1.44269504088896341f);
|
||||
const __vector float one = vec_splats(1.0f);
|
||||
const __vector float min_x = vec_splats(-87.3f);
|
||||
const __vector float max_x = vec_splats(88.7f);
|
||||
|
||||
// 5th-degree minimax polynomial for 2^r (r in [0,1))
|
||||
const __vector float c1 = vec_splats(0.6931471805599453f);
|
||||
const __vector float c2 = vec_splats(0.240226506959101f);
|
||||
const __vector float c3 = vec_splats(0.05550410866482158f);
|
||||
const __vector float c4 = vec_splats(0.009618129107628477f);
|
||||
const __vector float c5 = vec_splats(0.0013333558146428443f);
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
__vector float x = reg.val[i];
|
||||
|
||||
x = vec_max(x, min_x);
|
||||
x = vec_min(x, max_x);
|
||||
|
||||
__vector float y = vec_mul(x, log2e);
|
||||
|
||||
__vector float kf = vec_floor(y);
|
||||
__vector float r = vec_sub(y, kf);
|
||||
|
||||
__vector signed int k = vec_signed(kf);
|
||||
const __vector signed int min_k = vec_splats((signed int)-126);
|
||||
const __vector signed int max_k = vec_splats((signed int)127);
|
||||
k = vec_min(vec_max(k, min_k), max_k);
|
||||
|
||||
// Build 2^k from exponent bits
|
||||
__vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
|
||||
__vector unsigned int bits = (__vector unsigned int)exp_int;
|
||||
bits = vec_sl(bits, vec_splats((unsigned int)23));
|
||||
__vector float pow2k = (__vector float)bits;
|
||||
|
||||
// Improved minimax polynomial
|
||||
__vector float poly = vec_madd(c5, r, c4);
|
||||
poly = vec_madd(poly, r, c3);
|
||||
poly = vec_madd(poly, r, c2);
|
||||
poly = vec_madd(poly, r, c1);
|
||||
poly = vec_madd(poly, r, one);
|
||||
|
||||
out.val[i] = vec_mul(pow2k, poly);
|
||||
}
|
||||
|
||||
return FP32Vec8(out);
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::tanh(ar.values[0]);
|
||||
ret.val[0][1] = std::tanh(ar.values[1]);
|
||||
ret.val[0][2] = std::tanh(ar.values[2]);
|
||||
ret.val[0][3] = std::tanh(ar.values[3]);
|
||||
ret.val[1][0] = std::tanh(ar.values[4]);
|
||||
ret.val[1][1] = std::tanh(ar.values[5]);
|
||||
ret.val[1][2] = std::tanh(ar.values[6]);
|
||||
ret.val[1][3] = std::tanh(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
// tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
|
||||
const __vector float one = vec_splats(1.0f);
|
||||
const __vector float two = vec_splats(2.0f);
|
||||
const __vector float zero = vec_splats(0.0f);
|
||||
const __vector float sat =
|
||||
vec_splats(9.0f); // beyond this, tanh(x) ~ sign(x)
|
||||
|
||||
f32x4x2_t out;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
__vector float x = reg.val[i];
|
||||
__vector float ax = vec_abs(x);
|
||||
|
||||
// sign(x): +1 or -1
|
||||
__vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
|
||||
|
||||
// saturation mask: |x| > sat
|
||||
__vector __bool int saturated = vec_cmpgt(ax, sat);
|
||||
|
||||
// 2x
|
||||
__vector float two_x = vec_mul(x, two);
|
||||
|
||||
// Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
|
||||
f32x4x2_t tmp;
|
||||
tmp.val[0] = two_x;
|
||||
tmp.val[1] = two_x;
|
||||
FP32Vec8 exp_2x_vec(tmp);
|
||||
|
||||
FP32Vec8 e2x = exp_2x_vec.exp();
|
||||
__vector float e = e2x.reg.val[i];
|
||||
|
||||
// tanh(x) = (e - 1) / (e + 1)
|
||||
__vector float num = vec_sub(e, one);
|
||||
__vector float den = vec_add(e, one);
|
||||
|
||||
__vector float t = vec_div(num, den);
|
||||
|
||||
// For large |x|, clamp to sign(x)
|
||||
out.val[i] = vec_sel(t, sign, saturated);
|
||||
}
|
||||
|
||||
return FP32Vec8(out);
|
||||
}
|
||||
|
||||
  FP32Vec8 er() const {
    // TODO: Vectorize this
    AliasReg ar;
    ar.reg = reg;
    f32x4x4_t ret;
    ret.val[0][0] = std::erf(ar.values[0]);
    ret.val[0][1] = std::erf(ar.values[1]);
    ret.val[0][2] = std::erf(ar.values[2]);
    ret.val[0][3] = std::erf(ar.values[3]);
    ret.val[1][0] = std::erf(ar.values[4]);
    ret.val[1][1] = std::erf(ar.values[5]);
    ret.val[1][2] = std::erf(ar.values[6]);
    ret.val[1][3] = std::erf(ar.values[7]);
    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
    // A&S 7.1.26 approximation:
    // erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t *
    //          exp(-x^2)), where t = 1 / (1 + p*|x|) and p = 0.3275911

    const __vector float one = vec_splats(1.0f);
    const __vector float zero = vec_splats(0.0f);
    const __vector float p = vec_splats(0.3275911f);

    // Polynomial coeffs
    const __vector float a1 = vec_splats(0.254829592f);
    const __vector float a2 = vec_splats(-0.284496736f);
    const __vector float a3 = vec_splats(1.421413741f);
    const __vector float a4 = vec_splats(-1.453152027f);
    const __vector float a5 = vec_splats(1.061405429f);

    // Threshold where erf(x) ~ sign(x)
    const __vector float sat = vec_splats(6.0f);

    f32x4x2_t out;

    for (int lane = 0; lane < 2; lane++) {
      __vector float x = reg.val[lane];
      __vector float ax = vec_abs(x);

      // sign(x)
      __vector float sign =
          vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));

      // |x| > 6 → erf(x) = ±1
      __vector __bool int saturated = vec_cmpgt(ax, sat);

      // t = 1 / (1 + p * |x|)
      __vector float t = vec_madd(p, ax, one);
      t = vec_div(one, t);

      // poly = a5
      __vector float poly = a5;
      poly = vec_madd(poly, t, a4);
      poly = vec_madd(poly, t, a3);
      poly = vec_madd(poly, t, a2);
      poly = vec_madd(poly, t, a1);

      // full polynomial: poly = poly * t
      poly = vec_mul(poly, t);

      // Compute exp(-x^2)
      __vector float x2 = vec_mul(x, x);
      __vector float neg_x2 = vec_neg(x2);

      f32x4x2_t tmp;
      tmp.val[0] = neg_x2;
      tmp.val[1] = neg_x2;
      FP32Vec8 exp_neg_x2(tmp);

      FP32Vec8 e = exp_neg_x2.exp();
      __vector float ex = e.reg.val[lane];

      // erf(x) = sign * (1 - poly * exp(-x^2))
      __vector float term = vec_mul(poly, ex);
      __vector float y = vec_sub(one, term);
      y = vec_mul(y, sign);

      // saturated → ±1
      __vector float sat_val = vec_mul(sign, one);
      out.val[lane] = vec_sel(y, sat_val, saturated);
    }

    return FP32Vec8(out);
  }
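// Scalar reference (not from this diff) for the A&S 7.1.26 polynomial used
// above; the constants match the vector code, and the approximation's known
// maximum absolute error is about 1.5e-7.
#include <cmath>

static float erf_as7126(float x) {
  const float p = 0.3275911f;
  const float a1 = 0.254829592f, a2 = -0.284496736f, a3 = 1.421413741f;
  const float a4 = -1.453152027f, a5 = 1.061405429f;
  float sign = (x < 0.0f) ? -1.0f : 1.0f;
  float ax = std::fabs(x);
  float t = 1.0f / (1.0f + p * ax);
  float poly = ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t;
  return sign * (1.0f - poly * std::exp(-ax * ax));
}
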
  // Elementwise sigmoid(x) = 1 / (1 + exp(-x))
  FP32Vec8 sigmoid() const {
    const __vector float one = vec_splats(1.0f);

    f32x4x2_t neg;
    for (int i = 0; i < 2; ++i) {
      neg.val[i] = vec_neg(reg.val[i]);
    }

    FP32Vec8 neg_x(neg);
    FP32Vec8 e = neg_x.exp();  // exp(-x)

    f32x4x2_t denom;
    for (int i = 0; i < 2; ++i) {
      denom.val[i] = vec_add(one, e.reg.val[i]);
    }

    FP32Vec8 denom_vec(denom);
    FP32Vec8 one_vec(1.0f);

    return one_vec / denom_vec;
  }

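// Illustrative note (not from this diff): the 1 / (1 + exp(-x)) form above is
// well behaved at fp32 extremes: exp(-x) overflows to +inf for very negative
// x, giving 1/inf == 0, and underflows toward 0 for large x, giving exactly 1.
#include <cassert>
#include <cmath>

static void check_sigmoid_extremes() {
  float xneg = -100.0f, xpos = 100.0f;
  assert(1.0f / (1.0f + std::exp(-xneg)) == 0.0f);
  assert(1.0f / (1.0f + std::exp(-xpos)) == 1.0f);
}
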
  // Tanh-based GELU:
  // gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
  FP32Vec8 gelu_tanh() const {
    const __vector float k_s2pi = vec_splats(0.7978845608028654f);  // √(2/π)
    const __vector float k_0_0447 = vec_splats(0.044715f);

    f32x4x2_t x2, x3, inner;
    for (int i = 0; i < 2; ++i) {
      __vector float x = reg.val[i];
      x2.val[i] = vec_mul(x, x);                            // x^2
      x3.val[i] = vec_mul(x2.val[i], x);                    // x^3
      __vector float t = vec_madd(k_0_0447, x3.val[i], x);  // x + 0.044715*x^3
      inner.val[i] = vec_mul(k_s2pi, t);                    // √(2/π)*(...)
    }

    FP32Vec8 inner_vec(inner);
    FP32Vec8 t = inner_vec.tanh();  // tanh part

    FP32Vec8 one_vec(1.0f);
    FP32Vec8 half_vec(0.5f);

    FP32Vec8 x_vec(*this);
    return x_vec * half_vec * (one_vec + t);
  }

  // Erf-based GELU:
  // gelu(x) = 0.5 * x * (1 + erf(x / √2))
  FP32Vec8 gelu_erf() const {
    const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f);  // 1/√2
    FP32Vec8 x_vec(*this);

    f32x4x2_t scaled;
    for (int i = 0; i < 2; ++i) {
      scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
    }
    FP32Vec8 x_scaled(scaled);

    FP32Vec8 erf_x = x_scaled.er();

    FP32Vec8 one_vec(1.0f);
    FP32Vec8 half_vec(0.5f);

    return x_vec * half_vec * (one_vec + erf_x);
  }

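// Scalar reference (not from this diff) for the two GELU variants above; the
// tanh form approximates the erf form with a maximum absolute difference on
// the order of 1e-3.
#include <cmath>

static float gelu_erf_ref(float x) {
  return 0.5f * x * (1.0f + std::erf(x * 0.7071067811865476f));
}

static float gelu_tanh_ref(float x) {
  float inner = 0.7978845608028654f * (x + 0.044715f * x * x * x);
  return 0.5f * x * (1.0f + std::tanh(inner));
}
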
  // Elementwise reciprocal: 1/x (scalar per lane, for correctness)
  FP32Vec8 rcp() const {
    AliasReg in, out;
    in.reg = reg;

    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = 1.0f / in.values[i];
    }
    return FP32Vec8(out.reg);
  }

  // Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
  FP32Vec8 rsqrt() const {
    AliasReg in, out;
    in.reg = reg;

    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = 1.0f / std::sqrt(in.values[i]);
    }
    return FP32Vec8(out.reg);
  }

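// Possible vector alternative (an assumption, not part of this diff): if the
// target provides a vector square root (vec_sqrt on recent IBM Z vector
// facilities), rsqrt could stay in vector registers instead of going through
// AliasReg lanes:
//
//   FP32Vec8 rsqrt_vec() const {
//     const __vector float one = vec_splats(1.0f);
//     f32x4x2_t out;
//     for (int i = 0; i < 2; ++i) {
//       out.val[i] = vec_div(one, vec_sqrt(reg.val[i]));
//     }
//     return FP32Vec8(out);
//   }
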
  FP32Vec8 operator*(const FP32Vec8& b) const {
@@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  }

  explicit FP32Vec16(const BF16Vec16& v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
    reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
    reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
    // On big-endian s390x, place BF16 first to get correct byte order
    reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
    reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
    reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
    reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
  }

  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    return result;
  }

  FP32Vec16 max(const FP32Vec16& b) const {
    return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
                                vec_max(reg.val[1], b.reg.val[1]),
                                vec_max(reg.val[2], b.reg.val[2]),
                                vec_max(reg.val[3], b.reg.val[3])}));
  }

  float reduce_max() const {
    AliasReg ar;
    ar.reg = reg;
    float result = ar.values[0];
    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) {
      if (ar.values[i] > result) result = ar.values[i];
    });
    return result;
  }

  void save(float* ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
@@ -402,15 +628,14 @@ struct VecType<c10::BFloat16> {
  using vec_type = BF16Vec8;
};

// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead
using FP16Vec16 = FP32Vec16;

template <typename T>
void storeFP32(float v, T* ptr) {
  *ptr = v;
}

inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
  acc = acc + a * b;
}

namespace c10 {
struct BFloat16 {
  uint16_t value;  // Assume BFloat16 is defined as a struct containing a 16-bit
@@ -429,6 +654,79 @@ inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
  #define __VEC_CLASS_FP_NAN (1 << 6)
#endif

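// Illustrative scalar view (not from this diff) of the widening done in the
// FP32Vec16(const BF16Vec16&) constructor above: on big-endian s390x, merging
// the BF16 halfword first places it in the high 16 bits of each 32-bit lane,
// which is exactly fp32 with a truncated mantissa.
#include <cstdint>
#include <cstring>

static inline float bf16_to_fp32(uint16_t bf16) {
  uint32_t bits = (uint32_t)bf16 << 16;  // bf16 occupies the high halfword
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}
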
// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
// intrinsics

// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
  acc.reg = vec_madd(a.reg, b.reg, acc.reg);
}

// FP32Vec8 FMA: acc = acc + (a * b)
FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
  acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}

// FP32Vec16 FMA: acc = acc + (a * b)
FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
  acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
  acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
  acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}

// Multiply-Subtract: acc = (a * b) - acc (vec_msub computes a*b - c)
FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
  acc.reg = vec_msub(a.reg, b.reg, acc.reg);
}

FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
  acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}

FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
  acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
  acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
  acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}

// Negative Multiply-Add: acc = -((a * b) + acc) (vec_nmadd computes -(a*b + c))
FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
  acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
}

FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
  acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}

FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
  acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
  acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
  acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}

// Negative Multiply-Subtract: acc = acc - (a * b) (vec_nmsub computes -(a*b - c))
FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
  acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
}

FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
  acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}

FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
  acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
  acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
  acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
  acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}

const static __vector unsigned char omask = {2,  3,  6,  7,  10, 11, 14, 15,
                                             18, 19, 22, 23, 26, 27, 30, 31};
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
@@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};

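// Usage sketch (not from this diff): accumulating a dot product with the
// fused helpers above; each fma() maps onto single VFMA instructions.
//   FP32Vec8 acc(0.0f);
//   fma(acc, a0, b0);  // acc += a0 * b0
//   fma(acc, a1, b1);  // acc += a1 * b1
//   float dot = acc.reduce_sum();
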
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
  __vector unsigned int lsb0 = inp0 >> sh16;
  __vector unsigned int lsb1 = inp1 >> sh16;
  lsb0 = lsb0 & one;
  lsb1 = lsb1 & one;
  __vector unsigned int rnd0 = lsb0 + bias;
  __vector unsigned int rnd1 = lsb1 + bias;
  inp0 = inp0 + rnd0;
  inp1 = inp1 + rnd1;
  int cc;
  __vector __bool int sel0 =
      vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel1 =
      vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
  inp0 = vec_sel(inp0, nan, sel0) >> sh16;
  inp1 = vec_sel(inp1, nan, sel1) >> sh16;
  inp0 = vec_sel(inp0, nan, sel0);
  inp1 = vec_sel(inp1, nan, sel1);
  inp0 = inp0 >> sh16;
  inp1 = inp1 >> sh16;

  reg = (__vector signed short)vec_perm(inp0, inp1, omask);
}

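// Scalar equivalent (not from this diff) of the round-to-nearest-even bf16
// conversion above: add 0x7fff plus the current LSB of the target halfword,
// then truncate; NaNs are forced to a quiet-NaN pattern (the exact pattern of
// the `nan` constant lives elsewhere in the file, a canonical one is assumed
// here).
#include <cmath>
#include <cstdint>
#include <cstring>

static inline uint16_t fp32_to_bf16_rne(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if (std::isnan(f)) return 0x7fc0;       // canonical quiet NaN (assumed)
  bits += 0x7fffu + ((bits >> 16) & 1u);  // round to nearest even
  return (uint16_t)(bits >> 16);
}
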
@@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
  __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
  __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
  __vector unsigned int lsb0 = inp0 >> sh16;
  __vector unsigned int lsb1 = inp1 >> sh16;
  __vector unsigned int lsb2 = inp2 >> sh16;
  __vector unsigned int lsb3 = inp3 >> sh16;
  lsb0 = lsb0 & one;
  lsb1 = lsb1 & one;
  lsb2 = lsb2 & one;
  lsb3 = lsb3 & one;
  __vector unsigned int rnd0 = lsb0 + bias;
  __vector unsigned int rnd1 = lsb1 + bias;
  __vector unsigned int rnd2 = lsb2 + bias;
  __vector unsigned int rnd3 = lsb3 + bias;
  inp0 = inp0 + rnd0;
  inp1 = inp1 + rnd1;
  inp2 = inp2 + rnd2;
  inp3 = inp3 + rnd3;
  int cc;
  __vector __bool int sel0 =
      vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
@@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
      vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
  __vector __bool int sel3 =
      vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
  inp0 = vec_sel(inp0, nan, sel0) >> sh16;
  inp1 = vec_sel(inp1, nan, sel1) >> sh16;
  inp2 = vec_sel(inp2, nan, sel2) >> sh16;
  inp3 = vec_sel(inp3, nan, sel3) >> sh16;
  inp0 = vec_sel(inp0, nan, sel0);
  inp1 = vec_sel(inp1, nan, sel1);
  inp2 = vec_sel(inp2, nan, sel2);
  inp3 = vec_sel(inp3, nan, sel3);
  inp0 = inp0 >> sh16;
  inp1 = inp1 >> sh16;
  inp2 = inp2 >> sh16;
  inp3 = inp3 >> sh16;

  reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
  reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
}

inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
// 1D softmax over `n` elements in `input`, writes result to `output`.
// Uses FP32Vec8 for main body, scalar tail handling.
// Requirement: n > 0
FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
  if (n <= 0) return;

  // ---------- Pass 1: find max ----------
  float max_val = -std::numeric_limits<float>::infinity();
  int i = 0;

  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
    FP32Vec8 v(input + i);
    FP32Vec8::AliasReg ar;
    ar.reg = v.reg;
    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
      if (ar.values[j] > max_val) max_val = ar.values[j];
    }
  }
  for (; i < n; ++i) {
    if (input[i] > max_val) max_val = input[i];
  }

  // ---------- Pass 2: compute exp(x - max) and sum ----------
  float sum = 0.0f;
  i = 0;

  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
    float tmp[FP32Vec8::VEC_ELEM_NUM];
    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
      tmp[j] = input[i + j] - max_val;
    }

    FP32Vec8 v(tmp);
    FP32Vec8 e = v.exp();

    FP32Vec8::AliasReg ar;
    ar.reg = e.reg;
    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
      output[i + j] = ar.values[j];
      sum += ar.values[j];
    }
  }

  // Tail
  for (; i < n; ++i) {
    float x = input[i] - max_val;
    float ex = std::exp(x);  // scalar tail
    output[i] = ex;
    sum += ex;
  }

  // ---------- Pass 3: normalize ----------
  float inv_sum = 1.0f / sum;
  i = 0;

  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
    float tmp[FP32Vec8::VEC_ELEM_NUM];
    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
      tmp[j] = output[i + j] * inv_sum;
    }
    FP32Vec8 v(tmp);
    v.save(output + i);
  }

  for (; i < n; ++i) {
    output[i] *= inv_sum;
  }
}

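// Usage sketch (not from this diff), with a hypothetical caller: subtracting
// the max in pass 2 keeps exp() in range, so this is safe on raw logits.
//   float logits[11], probs[11];
//   softmax_fp32vec8(probs, logits, 11);  // 8-wide body + 3-element tail
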
// 1D RMSNorm kernel:
// input: x[0..n-1]
// weight: w[0..n-1] (gamma), may be nullptr
// output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
// eps: small epsilon for numerical stability
FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
                                   const float* weight, int n, float eps) {
  if (n <= 0) return;

  // ---------- Pass 1: compute sum of squares ----------
  float sum_sq = 0.0f;
  int i = 0;

  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
    FP32Vec8 x_vec(input + i);

    FP32Vec8 sq = x_vec * x_vec;

    FP32Vec8::AliasReg ar;
    ar.reg = sq.reg;
    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
      sum_sq += ar.values[j];
    }
  }

  // Tail
  for (; i < n; ++i) {
    float v = input[i];
    sum_sq += v * v;
  }

  float mean_sq = sum_sq / static_cast<float>(n);
  float inv_rms = 1.0f / std::sqrt(mean_sq + eps);

  // ---------- Pass 2: scale (and apply weight if given) ----------
  const float inv_rms_f = inv_rms;
  i = 0;

  if (weight) {
    // with gamma
    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
      FP32Vec8 x_vec(input + i);

      float wtmp[FP32Vec8::VEC_ELEM_NUM];
      for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
        wtmp[j] = weight[i + j];
      }
      FP32Vec8 w_vec(wtmp);

      FP32Vec8 scale_vec(inv_rms_f);
      FP32Vec8 y = x_vec * scale_vec * w_vec;
      y.save(output + i);
    }

    for (; i < n; ++i) {
      output[i] = input[i] * inv_rms_f * weight[i];
    }
  } else {
    // without gamma
    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
      FP32Vec8 x_vec(input + i);
      FP32Vec8 scale_vec(inv_rms_f);
      FP32Vec8 y = x_vec * scale_vec;
      y.save(output + i);
    }

    for (; i < n; ++i) {
      output[i] = input[i] * inv_rms_f;
    }
  }
}

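// Worked check (not from this diff): for x = {3, 4}, n = 2, eps = 0, no
// weight: mean_sq = (9 + 16) / 2 = 12.5, inv_rms = 1/sqrt(12.5) ~ 0.2828,
// so y ~ {0.8485, 1.1314}.
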
// Prefetch data to cache for better memory access performance
FORCE_INLINE void prefetch(const void* addr) {
  __builtin_prefetch(addr, 0, 3);  // 0=read, 3=high temporal locality
}

};  // namespace vec_op


@@ -40,6 +40,23 @@ namespace vec_op {

#define FORCE_INLINE __attribute__((always_inline)) inline

// Function to get the timestamp using RDTSCP
FORCE_INLINE uint64_t bench_timestamp() {
  unsigned int cycles_low, cycles_high;
  asm volatile(
      ".intel_syntax noprefix\n\t"
      "CPUID\n\t"        // Serialize instruction stream to ensure previous
                         // instructions complete
      "RDTSCP\n\t"       // Read TSC and core ID
      "mov %0, edx\n\t"  // Store high 32 bits of TSC
      "mov %1, eax\n\t"  // Store low 32 bits of TSC
      ".att_syntax"
      : "=r"(cycles_high), "=r"(cycles_low)::"rax", "rbx", "rcx",
        "rdx"  // Clobbered registers
  );
  return (uint64_t)cycles_high << 32 | cycles_low;
}

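// Usage sketch (not from this diff); `work_under_test` is a hypothetical
// function being timed in TSC cycles:
//   uint64_t t0 = bench_timestamp();
//   work_under_test();
//   uint64_t elapsed_cycles = bench_timestamp() - t0;
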
namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
@@ -87,6 +104,8 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  explicit FP16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

  explicit FP16Vec16(const c10::Half v) : reg(_mm256_set1_epi16(v.x)) {}

  explicit FP16Vec16(const FP32Vec16&);

  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
@@ -124,6 +143,8 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

  explicit BF16Vec16(const c10::BFloat16 v) : reg(_mm256_set1_epi16(v.x)) {}

  explicit BF16Vec16(const FP32Vec16&);

  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
@@ -333,6 +354,22 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16(__m512 data) : reg(data) {}

  // de-pack 4 bit values
  explicit FP32Vec16(int64_t value, const FP32Vec16& lut) {
    int64_t mask_0 = 0x0F0F0F0F0F0F0F0F;
    int64_t mask_1 = 0xF0F0F0F0F0F0F0F0;
    int64_t value_0 = value & mask_0;
    int64_t value_1 = value & mask_1;
    __m128i vec_0 = _mm_movpi64_epi64((__m64)value_0);
    __m128i vec_1 = _mm_movpi64_epi64((__m64)value_1);
    vec_0 = _mm_cvtepu8_epi16(vec_0);
    vec_1 = _mm_cvtepu8_epi16(vec_1);
    vec_1 = _mm_slli_epi16(vec_1, 4);
    __m128i vec = _mm_or_si128(vec_0, vec_1);
    __m512i vec_i32 = _mm512_cvtepu8_epi32(vec);
    reg = _mm512_permutexvar_ps(vec_i32, lut.reg);
  }

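// Scalar equivalent (not from this diff) of the 4-bit de-pack above: split
// each byte of the 64-bit word into two nibbles, lay them out one byte each
// in order (low nibble first), then use each nibble as an index into `lut`.
#include <cstdint>

static inline void depack16_scalar(int64_t value, const float lut[16],
                                   float out[16]) {
  for (int byte = 0; byte < 8; ++byte) {
    uint8_t b = (uint8_t)(value >> (8 * byte));
    out[2 * byte] = lut[b & 0x0F];              // low nibble
    out[2 * byte + 1] = lut[(b >> 4) & 0x0F];   // high nibble
  }
}
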
  explicit FP32Vec16(const FP32Vec4& data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
@@ -407,13 +444,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  float reduce_min() const { return _mm512_reduce_min_ps(reg); }

  template <int group_size>
  float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }
  float get_last_elem() const { return _mm512_cvtss_f32(reg); }

  void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); }

@@ -446,9 +477,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}

  explicit FP32Vec16(const FP32Vec16& data)
      : reg_low(data.reg_low), reg_high(data.reg_high) {}

  explicit FP32Vec16(const FP32Vec4& data)
      : reg_low((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)),
@@ -504,6 +532,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
                     _mm256_div_ps(reg_high, b.reg_high));
  }

  FP32Vec16 max(const FP32Vec16& b) const {
    return FP32Vec16(_mm256_max_ps(reg_low, b.reg_low),
                     _mm256_max_ps(reg_high, b.reg_high));
  }

  float reduce_max() const {
    __m256 v = _mm256_max_ps(reg_low, reg_high);
    // Permute to compare elements within 128-bit lanes
    __m256 v_shuffled = _mm256_permute_ps(
        v, 0b00001011);  // Swap halves within each 128-bit lane
    __m256 v_max = _mm256_max_ps(v, v_shuffled);

    v_shuffled = _mm256_permute_ps(
        v_max, 0b00000001);  // Shuffle elements within each 128-bit lane
    v_max = _mm256_max_ps(v_max, v_shuffled);

    // Permute to compare elements between 128-bit lanes
    v_shuffled =
        _mm256_permute2f128_ps(v_max, v_max, 0b00000001);  // Swap 128-bit lanes
    v_max = _mm256_max_ps(v_max, v_shuffled);

    // At this point, element 0 of v_max holds the maximum value.
    // Extract it for the scalar result.
    return _mm256_cvtss_f32(v_max);  // Extract the lowest 32-bit float
  }

  float reduce_sum() const {
    FP32Vec8 low = FP32Vec8(reg_low);
    FP32Vec8 high = FP32Vec8(reg_high);
@@ -642,7 +696,7 @@ inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
    : reg(_mm256_insertf128_si256(
          _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg),
          FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {}
          FP16Vec8(FP32Vec8(v.reg_high)).reg, 1)) {}
#endif

#ifdef __AVX512BF16__
@@ -713,6 +767,25 @@ inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
  _mm512_stream_ps((float*)ptr, vec.reg);
}

static void interleave_save(const BF16Vec16& vec0, const BF16Vec16& vec1,
                            void* ptr) {
  __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg);
  __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg);
  vec_1 = _mm512_slli_epi32(vec_1, 16);
  vec_0 = _mm512_or_si512(vec_0, vec_1);
  _mm512_storeu_epi32(ptr, vec_0);
}

static void interleave_save(const FP16Vec16& vec0, const FP16Vec16& vec1,
                            void* ptr) {
  __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg);
  __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg);
  vec_1 = _mm512_slli_epi32(vec_1, 16);
  vec_0 = _mm512_or_si512(vec_0, vec_1);
  _mm512_storeu_epi32(ptr, vec_0);
}

#endif

inline void mem_barrier() { _mm_mfence(); }

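// Scalar picture (not from this diff) of interleave_save above: two vectors
// of 16-bit elements a[0..15] and b[0..15] are written out as the paired
// sequence a0, b0, a1, b1, ..., a15, b15, which is the K-dimension pairing
// the AMX BF16 dot-product instruction expects.
#include <cstdint>

static inline void interleave16_scalar(const uint16_t a[16],
                                       const uint16_t b[16],
                                       uint16_t out[32]) {
  for (int i = 0; i < 16; ++i) {
    out[2 * i] = a[i];
    out[2 * i + 1] = b[i];
  }
}
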
csrc/cpu/cpu_wna16.cpp (new file, 402 lines)
@@ -0,0 +1,402 @@
#include "cpu_types.hpp"
|
||||
#include "scratchpad_manager.h"
|
||||
#include "utils.hpp"
|
||||
|
||||
#ifdef CPU_CAPABILITY_AMXBF16
|
||||
#include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
|
||||
#endif
|
||||
#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp"
|
||||
|
||||
#define VLLM_DISPATCH_CASE_16B_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_16B_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_16B_TYPES(__VA_ARGS__))
|
||||
|
||||
template <typename T>
|
||||
void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
|
||||
int32_t stride) {
|
||||
std::stringstream ss;
|
||||
ss << std::fixed << std::setprecision(5) << name << ": [\n";
|
||||
auto* curr_logits_buffer = ptr;
|
||||
for (int32_t m = 0; m < row; ++m) {
|
||||
for (int32_t n = 0; n < col; ++n) {
|
||||
ss << curr_logits_buffer[n] << ", ";
|
||||
}
|
||||
ss << "\n";
|
||||
curr_logits_buffer += stride;
|
||||
}
|
||||
ss << "]\n";
|
||||
std::printf("%s", ss.str().c_str());
|
||||
}
|
||||
|
||||
namespace {
using cpu_utils::ISA;
using cpu_utils::VecTypeTrait;

template <typename scalar_t, ISA isa, bool has_zp, bool use_desc_act>
class Dequantizer4b {
 public:
  constexpr static int32_t pack_num = 32 / 4;
  using scalar_vec_t = typename VecTypeTrait<scalar_t>::vec_t;

 public:
  static void dequant(int32_t* __restrict__ q_weight,
                      scalar_t* __restrict__ weight,
                      scalar_t* __restrict__ scales,
                      int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx,
                      const int64_t scales_stride, const int64_t zeros_stride,
                      const int32_t k_size, const int32_t group_size) {
    vec_op::FP32Vec16 lut;
    if constexpr (has_zp) {
      // AWQ
      alignas(64) static const float LUT[16] = {
          0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
          8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
      lut = vec_op::FP32Vec16(LUT);
    } else {
      // GPTQ
      alignas(64) static const float LUT[16] = {
          -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f,
          0.0f,  1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f};
      lut = vec_op::FP32Vec16(LUT);
    }

    // each 64-bit element packs 16 4-bit output channels
    int64_t* __restrict__ curr_q_weight = reinterpret_cast<int64_t*>(q_weight);
    int64_t* __restrict__ curr_zeros = reinterpret_cast<int64_t*>(zeros);
    scalar_t* __restrict__ curr_weight = weight;
    scalar_t* __restrict__ curr_scale = scales;
    vec_op::FP32Vec16 scale_0;
    vec_op::FP32Vec16 scale_1;
    vec_op::FP32Vec16 zero_0;
    vec_op::FP32Vec16 zero_1;
    int32_t group_counter = 0;
    for (int32_t k_idx = 0; k_idx < k_size; k_idx += 2) {
      int64_t qwb_0 = *curr_q_weight;
      int64_t qwb_1 = *(curr_q_weight + 1);
      vec_op::FP32Vec16 wb_0(qwb_0, lut);
      vec_op::FP32Vec16 wb_1(qwb_1, lut);

      if constexpr (!use_desc_act) {
        if (group_counter == 0) {
          scale_0 = vec_op::FP32Vec16(scalar_vec_t(curr_scale));
          scale_1 = vec_op::FP32Vec16(scale_0);
          curr_scale += scales_stride;

          if constexpr (has_zp) {
            zero_0 = vec_op::FP32Vec16(*curr_zeros, lut);
            zero_1 = vec_op::FP32Vec16(zero_0);
            curr_zeros += zeros_stride / 2;
          }
        }
      } else {
        int32_t g_idx_0 = g_idx[k_idx];
        int32_t g_idx_1 = g_idx[k_idx + 1];
        scale_0 = vec_op::FP32Vec16(
            scalar_vec_t(curr_scale + g_idx_0 * scales_stride));
        scale_1 = vec_op::FP32Vec16(
            scalar_vec_t(curr_scale + g_idx_1 * scales_stride));
        if constexpr (has_zp) {
          zero_0 = vec_op::FP32Vec16(*(curr_zeros + g_idx_0 * zeros_stride / 2),
                                     lut);
          zero_1 = vec_op::FP32Vec16(*(curr_zeros + g_idx_1 * zeros_stride / 2),
                                     lut);
        }
      }

      if constexpr (has_zp) {
        wb_0 = wb_0 - zero_0;
        wb_1 = wb_1 - zero_1;
      }

      wb_0 = wb_0 * scale_0;
      wb_1 = wb_1 * scale_1;

      scalar_vec_t output_vec_0(wb_0);
      scalar_vec_t output_vec_1(wb_1);

      // AMX needs to interleave K elements to pack them as 32 bits
      if constexpr (isa == ISA::AMX) {
        vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
      } else {
        output_vec_0.save(curr_weight);
        output_vec_1.save(curr_weight + 16);
      }

      // update
      curr_q_weight += 2;
      curr_weight += 32;
      if constexpr (!use_desc_act) {
        group_counter += 2;
        if (group_counter == group_size) {
          group_counter = 0;
        }
      }
    }
  }
};
};  // namespace

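// Scalar sketch (not from this diff) of the per-element dequantization the
// class above vectorizes: GPTQ maps nibble q through the signed LUT, i.e.
// (q - 8) * scale; AWQ keeps the unsigned LUT and subtracts a per-group zero
// point, i.e. (q - z) * scale.
static inline float dequant4b_scalar(int q, float scale, bool has_zp, int z) {
  return has_zp ? (float)(q - z) * scale   // AWQ
                : (float)(q - 8) * scale;  // GPTQ (symmetric, offset by 8)
}
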
template <typename scalar_t, typename dequantizer_t, typename gemm_t>
void cpu_gemm_wna16_impl(
    scalar_t* __restrict__ input, int32_t* __restrict__ q_weight,
    scalar_t* __restrict__ output, scalar_t* __restrict__ scales,
    int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx,
    scalar_t* __restrict__ bias, const int32_t m_size, const int32_t n_size,
    const int32_t k_size, const int64_t input_stride,
    const int64_t output_stride, const int64_t scales_group_stride,
    const int64_t zeros_group_stride, const int32_t group_num,
    const int32_t group_size, const int64_t pack_factor) {
  constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
  constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
  constexpr int32_t n_block_size = 16;
  static_assert(gemm_n_tile_size % n_block_size == 0);
  const int32_t thread_num = omp_get_max_threads();

  // a simple schedule policy, just to hold more B tiles in L2 and make sure
  // each thread has tasks (worked example after this block)
  const int32_t n_partition_size = [&]() {
    const int64_t cache_size = cpu_utils::get_l2_size();
    int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t));
    int64_t ps_thread_limit = n_size / thread_num;
    ps_cache_limit =
        std::max((ps_cache_limit / gemm_n_tile_size) * gemm_n_tile_size,
                 (int64_t)gemm_n_tile_size);
    ps_thread_limit =
        std::max((ps_thread_limit / gemm_n_tile_size) * gemm_n_tile_size,
                 (int64_t)gemm_n_tile_size);
    return std::min(ps_cache_limit, ps_thread_limit);
  }();
  const int32_t task_num = (n_size + n_partition_size - 1) / n_partition_size;

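// Worked example (not from this diff) of the schedule policy above: with a
// 2 MiB L2, k_size = 4096, bf16 B elements, and NSize = 32, the cache limit
// is 2097152 / (4096 * 2) = 256 columns (already a multiple of 32); with
// n_size = 8192 and 16 threads the thread limit is 512, so n_partition_size
// = min(256, 512) = 256 and task_num = 8192 / 256 = 32 tasks.
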
  // get buffer size
  const int64_t b_buffer_size =
      (((n_partition_size * k_size * sizeof(scalar_t) + 63) / 64) * 64);
  const int64_t c_buffer_size =
      (((gemm_m_tile_size * gemm_n_tile_size * sizeof(float) + 63) / 64) * 64);
  const int64_t b_buffer_offset = 0;
  const int64_t c_buffer_offset = b_buffer_size;
  const int64_t buffer_size = b_buffer_size + c_buffer_size;
  DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size *
                                                                thread_num);

  alignas(64) cpu_utils::Counter counter;
  cpu_utils::Counter* counter_ptr = &counter;

#pragma omp parallel for schedule(static, 1)
  for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
    scalar_t* __restrict__ b_buffer = nullptr;
    float* __restrict__ c_buffer = nullptr;
    {
      uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager()
                                ->get_data<uint8_t>() +
                            thread_id * buffer_size;
      b_buffer = reinterpret_cast<scalar_t*>(buffer_ptr + b_buffer_offset);
      c_buffer = reinterpret_cast<float*>(buffer_ptr + c_buffer_offset);
    }

    const int64_t q_weight_block_stride = n_block_size / pack_factor * k_size;
    const int64_t b_buffer_block_stride = n_block_size * k_size;
    const int32_t zeros_block_stride = n_block_size / pack_factor;

    gemm_t gemm;

    for (;;) {
      int32_t task_id = counter_ptr->acquire_counter();

      if (task_id >= task_num) {
        break;
      }

      const int32_t n_start_idx = task_id * n_partition_size;
      const int32_t n_block_start_idx = n_start_idx / n_block_size;
      const int32_t n_num = std::min(n_partition_size, n_size - n_start_idx);
      const int32_t n_block_num = n_num / n_block_size;
      // std::printf("thread_id: %d, task_id: %d, n_start_idx: %d, n_num: %d\n",
      //             thread_id, task_id, n_start_idx, n_num);

      // dequant weight
      {
        int32_t* __restrict__ curr_q_weight =
            q_weight + n_block_start_idx * q_weight_block_stride;
        scalar_t* __restrict__ curr_b_buffer = b_buffer;
        scalar_t* __restrict__ curr_scales = scales + n_start_idx;
        int32_t* __restrict__ curr_zeros = zeros + n_start_idx / pack_factor;
        for (int32_t block_idx = 0; block_idx < n_block_num; ++block_idx) {
          dequantizer_t::dequant(curr_q_weight, curr_b_buffer, curr_scales,
                                 curr_zeros, g_idx, scales_group_stride,
                                 zeros_group_stride, k_size, group_size);

          // if (block_idx == 0 && n_start_idx == 0) {
          //   print_logits("depacked weight", curr_b_buffer, k_size,
          //                n_block_size, n_block_size);
          // }

          // update
          curr_q_weight += q_weight_block_stride;
          curr_b_buffer += b_buffer_block_stride;
          curr_scales += n_block_size;
          curr_zeros += zeros_block_stride;
        }
      }

      // compute loop
      {
        const int32_t n_tile_num = n_num / gemm_n_tile_size;
        scalar_t* __restrict__ curr_input = input;
        scalar_t* __restrict__ init_bias = bias;
        if (bias != nullptr) {
          init_bias += n_start_idx;
        }
        scalar_t* __restrict__ init_output = output + n_start_idx;
        for (int32_t m_idx = 0; m_idx < m_size; m_idx += gemm_m_tile_size) {
          const int32_t curr_m_size =
              std::min(gemm_m_tile_size, m_size - m_idx);
          scalar_t* __restrict__ curr_b_buffer = b_buffer;
          scalar_t* __restrict__ curr_bias = init_bias;
          scalar_t* __restrict__ curr_output = init_output;
          for (int32_t n_tile_idx = 0; n_tile_idx < n_tile_num; ++n_tile_idx) {
            gemm.gemm(curr_input, curr_b_buffer, c_buffer, curr_m_size, k_size,
                      input_stride, b_buffer_block_stride, gemm_n_tile_size,
                      false);

            if (bias != nullptr) {
              cpu_micro_gemm::bias_epilogue<gemm_n_tile_size>(
                  c_buffer, curr_output, curr_bias, curr_m_size,
                  gemm_n_tile_size, output_stride);
              curr_bias += gemm_n_tile_size;
            } else {
              cpu_micro_gemm::default_epilogue<gemm_n_tile_size>(
                  c_buffer, curr_output, curr_m_size, gemm_n_tile_size,
                  output_stride);
            }

            curr_b_buffer +=
                b_buffer_block_stride * (gemm_n_tile_size / n_block_size);
            curr_output += gemm_n_tile_size;
          }
          curr_input += gemm_m_tile_size * input_stride;
          init_output += gemm_m_tile_size * output_stride;
        }
      }
    }
  }
}

void cpu_gemm_wna16(
    const torch::Tensor& input,  // [M, K]
    const torch::Tensor&
        q_weight,                // [N / 16, K * 16 / pack_factor], packed as int32
    torch::Tensor& output,       // [M, N]
    const torch::Tensor& scales, // [group_num, N]
    const std::optional<torch::Tensor>&
        zeros,                   // [group_num, N / pack_factor], packed as int32
    const std::optional<torch::Tensor>& g_idx,  // [K]
    const std::optional<torch::Tensor>& bias,   // [N]
    const int64_t pack_factor, const std::string& isa_hint) {
  using cpu_utils::ISA;
  TORCH_CHECK_EQ(pack_factor, 8);  // only supports 4bits
  const int32_t a_m_size = input.size(0);
  const int32_t a_k_size = input.size(1);
  const int64_t a_m_stride = input.stride(0);
  const int32_t b_n_size = q_weight.size(0) * 16;
  TORCH_CHECK_EQ(a_k_size % 32, 0);
  TORCH_CHECK_EQ(b_n_size % 32, 0);
  const int32_t group_num = scales.size(0);
  const int32_t group_size = a_k_size / group_num;
  TORCH_CHECK_EQ(group_size % 2, 0);
  const int64_t scales_group_stride = scales.stride(0);
  const int64_t output_m_stride = output.stride(0);

  bool has_zp = zeros.has_value();
  bool use_desc_act = g_idx.has_value();
  TORCH_CHECK(!(has_zp && use_desc_act));

  ISA isa = [&]() {
    if (isa_hint == "amx") {
      return ISA::AMX;
    } else if (isa_hint == "vec") {
      return ISA::VEC;
    } else {
      TORCH_CHECK(false, "unsupported isa hint: " + isa_hint);
    }
  }();

  int32_t* zeros_ptr = has_zp ? zeros->data_ptr<int32_t>() : nullptr;
  const int64_t zeros_group_stride = has_zp ? zeros->stride(0) : 0;
  int32_t* g_idx_ptr = use_desc_act ? g_idx->data_ptr<int32_t>() : nullptr;

  VLLM_DISPATCH_16B_TYPES(input.scalar_type(), "cpu_gemm_wna16", [&]() {
    if (isa == ISA::AMX) {
      using gemm_t = cpu_micro_gemm::MicroGemm<ISA::AMX, scalar_t>;
      if (has_zp) {
        using dequantizer_t = Dequantizer4b<scalar_t, ISA::AMX, true, false>;
        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
            scales_group_stride, zeros_group_stride, group_num, group_size,
            pack_factor);
        return;
      }
      if (use_desc_act) {
        using dequantizer_t = Dequantizer4b<scalar_t, ISA::AMX, false, true>;
        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
            scales_group_stride, zeros_group_stride, group_num, group_size,
            pack_factor);
        return;
      } else {
        using dequantizer_t = Dequantizer4b<scalar_t, ISA::AMX, false, false>;
        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
            scales_group_stride, zeros_group_stride, group_num, group_size,
            pack_factor);
        return;
      }
    } else if (isa == ISA::VEC) {
      using gemm_t = cpu_micro_gemm::MicroGemm<ISA::VEC, scalar_t>;
      if (has_zp) {
        using dequantizer_t = Dequantizer4b<scalar_t, ISA::VEC, true, false>;
        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
            scales_group_stride, zeros_group_stride, group_num, group_size,
            pack_factor);
        return;
      }
      if (use_desc_act) {
        using dequantizer_t = Dequantizer4b<scalar_t, ISA::VEC, false, true>;
        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
            scales_group_stride, zeros_group_stride, group_num, group_size,
            pack_factor);
        return;
      } else {
        using dequantizer_t = Dequantizer4b<scalar_t, ISA::VEC, false, false>;
        cpu_gemm_wna16_impl<scalar_t, dequantizer_t, gemm_t>(
            input.data_ptr<scalar_t>(), q_weight.data_ptr<int32_t>(),
            output.data_ptr<scalar_t>(), scales.data_ptr<scalar_t>(), zeros_ptr,
            g_idx_ptr, bias.has_value() ? bias->data_ptr<scalar_t>() : nullptr,
            a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
            scales_group_stride, zeros_group_stride, group_num, group_size,
            pack_factor);
        return;
      }
    }
  });
}
@@ -5,6 +5,7 @@
#include "common/memory.hpp"

#include "dnnl_helper.h"
#include "scratchpad_manager.h"

static dnnl::engine& default_engine() {
  static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
@@ -22,23 +23,6 @@ void release_dnnl_matmul_handler(int64_t handler) {
  delete ptr;
}

DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
  this->realloc(allocation_unit * 128);
}

void DNNLScratchPadManager::realloc(size_t new_size) {
  new_size = round(new_size);
  if (new_size > size_) {
    ptr_ = std::aligned_alloc(64, new_size);
    size_ = new_size;
  }
}

DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
  static DNNLScratchPadManager manager;
  return &manager;
}

template <typename KT, typename VT>
class DNNLPrimitiveCache {
 public:
@@ -412,9 +396,9 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
    : DNNLMatMulPrimitiveHandler(
          static_cast<DNNLMatMulPrimitiveHandler::Args>(args), args.ab_type),
      m_size_cache_(nullptr) {
  assert(ab_type_ == dnnl::memory::data_type::f32 ||
         ab_type_ == dnnl::memory::data_type::bf16 ||
         ab_type_ == dnnl::memory::data_type::f16);
  assert(b_type_ == dnnl::memory::data_type::f32 ||
         b_type_ == dnnl::memory::data_type::bf16 ||
         b_type_ == dnnl::memory::data_type::f16);

  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});

@@ -59,30 +59,6 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() {
  return DNNLType<std::decay_t<T>>::type;
}

class DNNLScratchPadManager {
 public:
  static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4 MiB

  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();

  DNNLScratchPadManager();

  template <typename T>
  T* get_data() {
    return reinterpret_cast<T*>(ptr_);
  }

  static size_t round(size_t size) {
    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
  }

  void realloc(size_t new_size);

 private:
  size_t size_;
  void* ptr_;
};

class DNNLMatMulPrimitiveHandler {
 public:
  virtual ~DNNLMatMulPrimitiveHandler() = default;

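// Worked check (not from this diff): round() above rounds up to the 4 MiB
// allocation unit, e.g. round(5 * 1024 * 1024) == 8 * 1024 * 1024.
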
csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp (new file, 245 lines)
@@ -0,0 +1,245 @@
#ifndef CPU_MICRO_GEMM_AMX_HPP
#define CPU_MICRO_GEMM_AMX_HPP
#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp"

namespace cpu_micro_gemm {
namespace {
// AMX specific
constexpr static int64_t AMX_TILE_ROW_BYTES = 64;
constexpr static int64_t AMX_TILE_ROW_NUM = 16;
constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM;

typedef struct __tile_config {
  uint8_t palette_id = 1;
  uint8_t start_row = 0;
  uint8_t reserved_0[14] = {0};
  uint16_t colsb[16] = {0};
  uint8_t rows[16] = {0};
} __tilecfg;

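// Note (not from this diff): on Linux, userspace must request AMX tile state
// once per process before any _tile_* intrinsic below can execute, or the
// first tile instruction faults. A typical guard looks like this:
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILEDATA 18

static bool request_amx_permission() {
  return syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) == 0;
}
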
// 2-2-4 pattern, for 16 < m <= 32
|
||||
// TILE 0, 1: load A matrix, row num should be 16, m - 16
|
||||
// TILE 2, 3: load B matrix, row num should be 16
|
||||
// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m
|
||||
// - 16
|
||||
template <typename scalar_t>
|
||||
class TileGemm224 {
|
||||
public:
|
||||
FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
TORCH_CHECK(false, "Unsupported data type for TileGemm224");
|
||||
}
|
||||
|
||||
FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
|
||||
TORCH_CHECK(false, "Unsupported data type for TileGemm224");
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class TileGemm224<c10::BFloat16> {
|
||||
public:
|
||||
using scalar_t = c10::BFloat16;
|
||||
FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
|
||||
c10::BFloat16* __restrict__ a_tile_0 = a_ptr;
|
||||
c10::BFloat16* __restrict__ a_tile_1 = a_ptr + lda * AMX_TILE_ROW_NUM;
|
||||
const int64_t a_tile_stride = lda * sizeof(c10::BFloat16);
|
||||
|
||||
// B is always packed as 16 output channels block
|
||||
c10::BFloat16* __restrict__ b_tile_2 = b_ptr;
|
||||
c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride;
|
||||
const int32_t b_tile_stride = AMX_TILE_ROW_BYTES;
|
||||
|
||||
float* __restrict__ c_tile_4 = c_ptr;
|
||||
float* __restrict__ c_tile_5 =
|
||||
c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float);
|
||||
float* __restrict__ c_tile_6 = c_ptr + AMX_TILE_ROW_NUM * ldc;
|
||||
float* __restrict__ c_tile_7 =
|
||||
c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float);
|
||||
const int32_t c_tile_stride = ldc * sizeof(float);
|
||||
|
||||
if (accum_c) {
|
||||
_tile_loadd(4, c_tile_4, c_tile_stride);
|
||||
_tile_loadd(5, c_tile_5, c_tile_stride);
|
||||
_tile_loadd(6, c_tile_6, c_tile_stride);
|
||||
_tile_loadd(7, c_tile_7, c_tile_stride);
|
||||
} else {
|
||||
_tile_zero(4);
|
||||
_tile_zero(5);
|
||||
_tile_zero(6);
|
||||
_tile_zero(7);
|
||||
}
|
||||
|
||||
for (int32_t k = 0; k < k_times; ++k) {
|
||||
_tile_loadd(0, a_tile_0, a_tile_stride);
|
||||
_tile_stream_loadd(2, b_tile_2, b_tile_stride);
|
||||
_tile_dpbf16ps(4, 0, 2);
|
||||
_tile_stream_loadd(3, b_tile_3, b_tile_stride);
|
||||
_tile_dpbf16ps(5, 0, 3);
|
||||
_tile_loadd(1, a_tile_1, a_tile_stride);
|
||||
_tile_dpbf16ps(6, 1, 2);
|
||||
_tile_dpbf16ps(7, 1, 3);
|
||||
|
||||
// update ptrs
|
||||
a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
|
||||
a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
|
||||
b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
}
|
||||
|
||||
_tile_stored(4, c_tile_4, c_tile_stride);
|
||||
_tile_stored(5, c_tile_5, c_tile_stride);
|
||||
_tile_stored(6, c_tile_6, c_tile_stride);
|
||||
_tile_stored(7, c_tile_7, c_tile_stride);
|
||||
}
|
||||
|
||||
FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
|
||||
const int32_t m_0 = AMX_TILE_ROW_NUM;
|
||||
const int32_t m_1 = m - AMX_TILE_ROW_NUM;
|
||||
config.rows[0] = m_0;
|
||||
config.rows[1] = m_1;
|
||||
config.rows[2] = AMX_TILE_ROW_NUM;
|
||||
config.rows[3] = AMX_TILE_ROW_NUM;
|
||||
config.rows[4] = m_0;
|
||||
config.rows[5] = m_0;
|
||||
config.rows[6] = m_1;
|
||||
config.rows[7] = m_1;
|
||||
_tile_loadconfig(&config);
|
||||
}
|
||||
};
|
||||
|
||||
// 1-2-2 pattern, for 0 < m <= 16
|
||||
// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be
|
||||
// m, m
|
||||
// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row
|
||||
// num should be 16
|
||||
// TILE 6, 7, (6, 7): store results C matrix, row num should be
|
||||
// m
|
||||
template <typename scalar_t>
|
||||
class TileGemm122 {
|
||||
public:
|
||||
FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
TORCH_CHECK(false, "Unsupported data type for TileGemm122");
|
||||
}
|
||||
|
||||
FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
|
||||
TORCH_CHECK(false, "Unsupported data type for TileGemm122");
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class TileGemm122<c10::BFloat16> {
|
||||
public:
|
||||
using scalar_t = c10::BFloat16;
|
||||
FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
c10::BFloat16* __restrict__ a_tile_0 = a_ptr;
|
||||
c10::BFloat16* __restrict__ a_tile_1 =
|
||||
a_ptr + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
|
||||
const int64_t a_tile_stride = lda * sizeof(c10::BFloat16);
|
||||
|
||||
c10::BFloat16* __restrict__ b_tile_2 = b_ptr;
|
||||
c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride;
|
||||
c10::BFloat16* __restrict__ b_tile_4 =
|
||||
b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
c10::BFloat16* __restrict__ b_tile_5 =
|
||||
b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
int64_t b_stride = AMX_TILE_ROW_BYTES;
|
||||
|
||||
float* __restrict__ c_tile_6 = c_ptr;
|
||||
float* __restrict__ c_tile_7 = c_ptr + AMX_TILE_ROW_BYTES / sizeof(float);
|
||||
int64_t c_stride = ldc * sizeof(float);
|
||||
|
||||
const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
|
||||
const int32_t k_group_times = k_times / 2;
|
||||
const bool has_tail = (k_times % 2 == 1);
|
||||
|
||||
if (accum_c) {
|
||||
_tile_loadd(6, c_tile_6, c_stride);
|
||||
_tile_loadd(7, c_tile_7, c_stride);
|
||||
} else {
|
||||
_tile_zero(6);
|
||||
_tile_zero(7);
|
||||
}
|
||||
|
||||
for (int32_t k = 0; k < k_group_times; ++k) {
|
||||
_tile_loadd(0, a_tile_0, a_tile_stride);
|
||||
_tile_stream_loadd(2, b_tile_2, b_stride);
|
||||
_tile_dpbf16ps(6, 0, 2);
|
||||
_tile_stream_loadd(3, b_tile_3, b_stride);
|
||||
_tile_dpbf16ps(7, 0, 3);
|
||||
_tile_loadd(1, a_tile_1, a_tile_stride);
|
||||
_tile_stream_loadd(4, b_tile_4, b_stride);
|
||||
_tile_dpbf16ps(6, 1, 4);
|
||||
_tile_stream_loadd(5, b_tile_5, b_stride);
|
||||
_tile_dpbf16ps(7, 1, 5);
|
||||
|
||||
// update ptrs
|
||||
a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
|
||||
a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
|
||||
b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
|
||||
}
|
||||
|
||||
if (has_tail) {
|
||||
_tile_loadd(0, a_tile_0, a_tile_stride);
|
||||
_tile_stream_loadd(2, b_tile_2, b_stride);
|
||||
_tile_dpbf16ps(6, 0, 2);
|
||||
_tile_stream_loadd(3, b_tile_3, b_stride);
|
||||
_tile_dpbf16ps(7, 0, 3);
|
||||
}
|
||||
|
||||
_tile_stored(6, c_tile_6, c_stride);
|
||||
_tile_stored(7, c_tile_7, c_stride);
|
||||
}
|
||||
|
||||
FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
|
||||
config.rows[0] = m;
|
||||
config.rows[1] = m;
|
||||
config.rows[2] = AMX_TILE_ROW_NUM;
|
||||
config.rows[3] = AMX_TILE_ROW_NUM;
|
||||
config.rows[4] = AMX_TILE_ROW_NUM;
|
||||
config.rows[5] = AMX_TILE_ROW_NUM;
|
||||
config.rows[6] = m;
|
||||
config.rows[7] = m;
|
||||
_tile_loadconfig(&config);
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
// Gemm kernel uses AMX, requires B matrix to be packed
|
||||
template <typename scalar_t>
|
||||
class MicroGemm<cpu_utils::ISA::AMX, scalar_t> {
|
||||
public:
|
||||
static constexpr int32_t MaxMSize = 32;
|
||||
static constexpr int32_t NSize = 32;
|
||||
|
||||
public:
|
||||
MicroGemm() : curr_m_(-1) {
|
||||
vec_op::unroll_loop<int, 8>([&](int i) { amx_tile_config_.colsb[i] = 64; });
|
||||
}
|
||||
|
||||
void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
if (m > AMX_TILE_ROW_NUM) {
|
||||
if (m != curr_m_) {
|
||||
curr_m_ = m;
|
||||
TileGemm224<scalar_t>::init_tile_config(m, amx_tile_config_);
|
||||
}
|
||||
TileGemm224<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
|
||||
} else {
|
||||
if (m != curr_m_) {
|
||||
curr_m_ = m;
|
||||
TileGemm122<scalar_t>::init_tile_config(m, amx_tile_config_);
|
||||
}
|
||||
TileGemm122<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
alignas(64) __tilecfg amx_tile_config_;
|
||||
int32_t curr_m_;
|
||||
};
|
||||
|
||||
} // namespace cpu_micro_gemm
|
||||
|
||||
#endif
|
||||
csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp (new file, 91 lines)
@@ -0,0 +1,91 @@
#ifndef CPU_MICRO_GEMM_IMPL_HPP
#define CPU_MICRO_GEMM_IMPL_HPP
#include "cpu/utils.hpp"
#include "cpu/cpu_types.hpp"

namespace cpu_micro_gemm {
#define DEFINE_CPU_MICRO_GEMM_PARAMS                                       \
  scalar_t *__restrict__ a_ptr, scalar_t *__restrict__ b_ptr,              \
      float *__restrict__ c_ptr, const int32_t m, const int32_t k,         \
      const int64_t lda, const int64_t b_n_group_stride, const int64_t ldc, \
      const bool accum_c

#define CPU_MICRO_GEMM_PARAMS \
  a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c

template <cpu_utils::ISA isa, typename scalar_t>
class MicroGemm {
 public:
  static constexpr int32_t MaxMSize = 16;
  static constexpr int32_t NSize = 16;

 public:
  void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
    TORCH_CHECK(false, "Unimplemented MicroGemm.");
  }
};

template <int32_t n_size, typename scalar_t>
FORCE_INLINE void default_epilogue(float* __restrict__ c_ptr,
                                   scalar_t* __restrict__ d_ptr,
                                   const int32_t m, const int64_t ldc,
                                   const int64_t ldd) {
  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
  static_assert(n_size % 16 == 0);

  float* __restrict__ curr_c = c_ptr;
  scalar_t* __restrict__ curr_d = d_ptr;
  for (int32_t i = 0; i < m; ++i) {
    float* __restrict__ curr_c_iter = curr_c;
    scalar_t* __restrict__ curr_d_iter = curr_d;
    vec_op::unroll_loop<int32_t, n_size / 16>([&](int32_t n_g_idx) {
      vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
      scalar_vec_t c_vec(c_vec_fp32);
      c_vec.save(curr_d_iter);
      curr_c_iter += 16;
      curr_d_iter += 16;
    });
    curr_c += ldc;
    curr_d += ldd;
  }
}

template <int32_t n_size, typename scalar_t>
FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr,
                                scalar_t* __restrict__ d_ptr,
                                scalar_t* __restrict__ bias_ptr,
                                const int32_t m, const int64_t ldc,
                                const int64_t ldd) {
  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
  static_assert(n_size % 16 == 0);
  constexpr int32_t n_group_num = n_size / 16;
  static_assert(n_group_num <= 16);

  vec_op::FP32Vec16 bias_vecs[n_group_num];
  scalar_t* __restrict__ curr_bias = bias_ptr;
  vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t i) {
    scalar_vec_t vec(curr_bias);
    bias_vecs[i] = vec_op::FP32Vec16(vec);
    curr_bias += 16;
  });

  float* __restrict__ curr_c = c_ptr;
  scalar_t* __restrict__ curr_d = d_ptr;
  for (int32_t i = 0; i < m; ++i) {
    float* __restrict__ curr_c_iter = curr_c;
    scalar_t* __restrict__ curr_d_iter = curr_d;
    vec_op::unroll_loop<int32_t, n_group_num>([&](int32_t n_g_idx) {
      vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
      c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx];
      scalar_vec_t c_vec(c_vec_fp32);
      c_vec.save(curr_d_iter);
      curr_c_iter += 16;
      curr_d_iter += 16;
    });
    curr_c += ldc;
    curr_d += ldd;
  }
}
}  // namespace cpu_micro_gemm

#endif
csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp (new file, 115 lines)
@@ -0,0 +1,115 @@
#ifndef CPU_MICRO_GEMM_VEC_HPP
|
||||
#define CPU_MICRO_GEMM_VEC_HPP
|
||||
#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp"
|
||||
|
||||
namespace cpu_micro_gemm {
|
||||
namespace {
|
||||
// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32]
|
||||
template <typename scalar_t>
|
||||
class TileGemm82 {
|
||||
public:
|
||||
FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
switch (m) {
|
||||
case 1:
|
||||
gemm_micro<1>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
case 2:
|
||||
gemm_micro<2>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
case 3:
|
||||
gemm_micro<3>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
case 4:
|
||||
gemm_micro<4>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
case 5:
|
||||
gemm_micro<5>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
case 6:
|
||||
gemm_micro<6>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
case 7:
|
||||
gemm_micro<7>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
case 8:
|
||||
gemm_micro<8>(CPU_MICRO_GEMM_PARAMS);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <int32_t M>
|
||||
static void gemm_micro(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
static_assert(0 < M <= 8);
|
||||
using load_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
|
||||
|
||||
scalar_t* __restrict__ curr_b_0 = b_ptr;
|
||||
scalar_t* __restrict__ curr_b_1 = b_ptr + b_n_group_stride;
|
||||
float* __restrict__ curr_c_0 = c_ptr;
|
||||
float* __restrict__ curr_c_1 = c_ptr + 16;
|
||||
|
||||
vec_op::FP32Vec16 c_regs[M * 2];
|
||||
if (accum_c) {
|
||||
float* __restrict__ curr_m_c_0 = curr_c_0;
|
||||
float* __restrict__ curr_m_c_1 = curr_c_1;
|
||||
vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
|
||||
c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0);
|
||||
c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1);
|
||||
|
||||
// update
|
||||
curr_m_c_0 += ldc;
|
||||
curr_m_c_1 += ldc;
|
||||
});
|
||||
}
|
||||
|
||||
scalar_t* __restrict__ curr_a = a_ptr;
|
||||
for (int32_t k_idx = 0; k_idx < k; ++k_idx) {
|
||||
load_vec_t b_0_reg(curr_b_0);
|
||||
vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
|
||||
load_vec_t b_1_reg(curr_b_1);
|
||||
vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);
|
||||
|
||||
scalar_t* __restrict__ curr_m_a = curr_a;
|
||||
vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
|
||||
scalar_t v = *curr_m_a;
|
||||
load_vec_t a_reg_original(v);
|
||||
vec_op::FP32Vec16 a_reg(a_reg_original);
|
||||
c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
|
||||
c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;
|
||||
|
||||
// update
|
||||
curr_m_a += lda;
|
||||
});
|
||||
|
||||
// update
|
||||
curr_a += 1;
|
||||
curr_b_0 += 16;
|
||||
curr_b_1 += 16;
|
||||
}
|
||||
|
||||
vec_op::unroll_loop<int32_t, M>([&](int32_t i) {
|
||||
c_regs[i * 2].save(curr_c_0);
|
||||
c_regs[i * 2 + 1].save(curr_c_1);
|
||||
|
||||
// update
|
||||
curr_c_0 += ldc;
|
||||
curr_c_1 += ldc;
|
||||
});
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
// Gemm kernel uses vector instructions, requires B matrix to be packed
|
||||
template <typename scalar_t>
|
||||
class MicroGemm<cpu_utils::ISA::VEC, scalar_t> {
|
||||
public:
|
||||
static constexpr int32_t MaxMSize = 8;
|
||||
static constexpr int32_t NSize = 32;
|
||||
|
||||
public:
|
||||
void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
|
||||
TileGemm82<scalar_t>::gemm(CPU_MICRO_GEMM_PARAMS);
|
||||
}
|
||||
};
|
||||
} // namespace cpu_micro_gemm
|
||||
|
||||
#endif
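
The 8-2-16 tiling above keeps, per K step, up to 8 broadcast A values, 2 loaded B vectors, and 2 accumulators per row (16 C registers at M = 8) live at once. It assumes B is pre-packed so that each 16-column group is stored K-major, with the second group b_n_group_stride elements after the first. A scalar sketch of what gemm_micro<M> computes under that layout (illustrative, not part of the commit):

template <int M, typename scalar_t>
void tile_gemm_82_ref(const scalar_t* a, const scalar_t* b, float* c, int k,
                      int64_t lda, int64_t b_n_group_stride, int64_t ldc,
                      bool accum_c) {
  for (int i = 0; i < M; ++i) {
    for (int g = 0; g < 2; ++g) {  // two 16-column groups of packed B
      const scalar_t* bg = b + g * b_n_group_stride;
      for (int j = 0; j < 16; ++j) {
        float acc = accum_c ? c[i * ldc + g * 16 + j] : 0.0f;
        for (int kk = 0; kk < k; ++kk) {  // packed B: row kk starts at kk * 16
          acc += static_cast<float>(a[i * lda + kk]) *
                 static_cast<float>(bg[kk * 16 + j]);
        }
        c[i * ldc + g * 16 + j] = acc;
      }
    }
  }
}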

23  csrc/cpu/scratchpad_manager.cpp  Normal file
@@ -0,0 +1,23 @@
#include <cstdlib>

#include "scratchpad_manager.h"

DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
  this->realloc(allocation_unit * 128);
}

void DNNLScratchPadManager::realloc(size_t new_size) {
  new_size = round(new_size);
  if (new_size > size_) {
    if (ptr_ != nullptr) {
      std::free(ptr_);
    }
    ptr_ = std::aligned_alloc(64, new_size);
    size_ = new_size;
  }
}

DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
  static DNNLScratchPadManager manager;
  return &manager;
}

31  csrc/cpu/scratchpad_manager.h  Normal file
@@ -0,0 +1,31 @@
#ifndef SCRATCHPAD_MANAGER_H
#define SCRATCHPAD_MANAGER_H

#include <cstddef>
#include <cstdio>

class DNNLScratchPadManager {
 public:
  static constexpr size_t allocation_unit = 4 * 1024;  // 4KB

  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();

  DNNLScratchPadManager();

  template <typename T>
  T* get_data() {
    return reinterpret_cast<T*>(ptr_);
  }

  static size_t round(size_t size) {
    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
  }

  void realloc(size_t new_size);

 private:
  size_t size_;
  void* ptr_;
};

#endif
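
A sketch of the intended call pattern (the caller function is illustrative, not from the commit): realloc already rounds the request up to the 4KB allocation_unit and only ever grows the buffer, so callers can pass the exact byte count they need.

void run_with_scratch(size_t bytes_needed) {
  auto* manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
  manager->realloc(bytes_needed);  // no-op when the pad is already big enough
  float* scratch = manager->get_data<float>();  // 64-byte-aligned workspace
  // ... use scratch as temporary storage for the duration of the op ...
}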

@@ -192,7 +192,7 @@ class SHMManager {
                  const int group_size)
      : _rank(rank),
        _group_size(group_size),
        _thread_num(torch::get_num_threads()),
        _thread_num(omp_get_max_threads()),
        _shm_names({""}),
        _shared_mem_ptrs({nullptr}),
        _shm_ctx(nullptr) {

@@ -74,25 +74,45 @@ at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2,
                                     const std::optional<at::Tensor>& bias,
                                     at::ScalarType out_dtype, bool is_vnni);

torch::Tensor get_scheduler_metadata(
    const int64_t num_req, const int64_t num_heads_q,
    const int64_t num_heads_kv, const int64_t head_dim,
    const torch::Tensor& seq_lens, at::ScalarType dtype,
    const torch::Tensor& query_start_loc, const bool casual,
    const int64_t window_size, const std::string& isa_hint,
    const bool enable_kv_split);

void cpu_attn_reshape_and_cache(const torch::Tensor& key,
                                const torch::Tensor& value,
                                torch::Tensor& key_cache,
                                torch::Tensor& value_cache,
                                const torch::Tensor& slot_mapping,
                                const std::string& isa);

void cpu_attention_with_kv_cache(
    const torch::Tensor& query, const torch::Tensor& key_cache,
    const torch::Tensor& value_cache, torch::Tensor& output,
    const torch::Tensor& query_start_loc, const torch::Tensor& seq_lens,
    const double scale, const bool causal,
    const std::optional<torch::Tensor>& alibi_slopes,
    const int64_t sliding_window_left, const int64_t sliding_window_right,
    const torch::Tensor& block_table, const double softcap,
    const torch::Tensor& scheduler_metadata,
    const std::optional<torch::Tensor>& s_aux);

// Note: defined only to avoid import errors
void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }

void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight,
                    torch::Tensor& output, const torch::Tensor& scales,
                    const std::optional<torch::Tensor>& zeros,
                    const std::optional<torch::Tensor>& g_idx,
                    const std::optional<torch::Tensor>& bias,
                    const int64_t pack_factor, const std::string& isa_hint);

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

  // Attention ops
  // Compute the attention between an input query and the cached keys/values
  // using PagedAttention.
  ops.def(
      "paged_attention_v1("
      " Tensor! out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
      " str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
      " int tp_rank, int blocksparse_local_blocks,"
      " int blocksparse_vert_stride, int blocksparse_block_size,"
      " int blocksparse_head_sliding_step) -> ()");

  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);

  ops.def(
      "dynamic_4bit_int_moe("
      "Tensor x, Tensor topk_ids, Tensor topk_weights,"
@@ -102,20 +122,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);

  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
      " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
      " Tensor! tmp_out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
      " str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
      " int tp_rank, int blocksparse_local_blocks,"
      " int blocksparse_vert_stride, int blocksparse_block_size,"
      " int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);

  // Activation ops

  // Activation function used in SwiGLU.
@@ -166,7 +172,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Quantization
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
    defined(__powerpc64__)
  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
  // Helper function to release oneDNN handlers
  ops.def("release_dnnl_matmul_handler(int handler) -> ()",
          &release_dnnl_matmul_handler);
@@ -202,15 +207,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
      "Tensor? azp) -> ()",
      {stride_tag});
      "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

  // Compute int8 quantized tensor and scaling factor
  ops.def(
      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
      "Tensor!? azp) -> ()",
      {stride_tag});
      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
#endif
@@ -259,37 +262,40 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("int8_scaled_mm_with_quant", torch::kCPU,
           &int8_scaled_mm_with_quant);
#endif
}

TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Cache ops
  // Swap in (out) the cache blocks from src to dst.
  cache_ops.def(
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);
  // CPU attention kernels
  ops.def(
      "get_scheduler_metadata(int num_req, int num_heads_q, int num_heads_kv, "
      "int head_dim, Tensor seq_lens, ScalarType dtype, Tensor "
      "query_start_loc, bool casual, int window_size, str isa_hint, bool "
      "enable_kv_split) -> Tensor",
      &get_scheduler_metadata);
  ops.def(
      "cpu_attn_reshape_and_cache(Tensor key, Tensor value, Tensor(a2!) "
      "key_cache, Tensor(a3!) value_cache, Tensor slot_mapping, str "
      "isa) -> ()",
      &cpu_attn_reshape_and_cache);
  ops.def(
      "cpu_attention_with_kv_cache(Tensor query, Tensor key_cache, Tensor "
      "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
      "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
      "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
      "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
      &cpu_attention_with_kv_cache);

  // Copy the cache blocks from src to dst.
  cache_ops.def(
      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
  // placeholders
  ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
  ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
  ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);

  // Reshape the key and value tensors and cache them.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"
      " Tensor! key_cache, Tensor! value_cache,"
      " Tensor slot_mapping,"
      " str kv_cache_dtype,"
      " Tensor k_scale, Tensor v_scale) -> ()");
  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);

  cache_ops.def(
      "concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
      " Tensor! kv_cache,"
      " Tensor slot_mapping,"
      " str kv_cache_dtype,"
      " Tensor scale) -> ()");
  cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
  // WNA16
#if defined(__AVX512F__)
  ops.def(
      "cpu_gemm_wna16(Tensor input, Tensor q_weight, Tensor(a2!) output, "
      "Tensor scales, Tensor? zeros, Tensor? g_idx, Tensor? bias, SymInt "
      "pack_factor, str isa_hint) -> ()");
  ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16);
#endif
}

TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {

@@ -45,21 +45,55 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
  // Memory node binding
  if (numa_available() != -1) {
    int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
    bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str());
    bitmask* src_mask = numa_get_membind();

    int pid = getpid();

    // move all existing pages to the specified numa node.
    *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
    int page_num = numa_migrate_pages(pid, src_mask, mask);
    if (page_num == -1) {
      TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
    std::set<int> node_ids;
    for (const auto& cpu_id : omp_cpu_ids) {
      int node_id = numa_node_of_cpu(cpu_id);
      if (node_id != -1) {
        node_ids.insert(node_id);
      }
      if (node_id != mem_node_id) {
        TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
                   omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
                   ". All CPUs should be on the same NUMA node for optimal "
                   "performance. Memory will be bound to NUMA node ",
                   mem_node_id, ".");
      }
    }
    // Concatenate all node_ids into a single comma-separated string
    if (!node_ids.empty()) {
      std::string node_ids_str;
      for (const int node_id : node_ids) {
        if (!node_ids_str.empty()) {
          node_ids_str += ",";
        }
        node_ids_str += std::to_string(node_id);
      }

      // restrict memory allocation node.
      numa_set_membind(mask);
      numa_set_strict(1);
      bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
      bitmask* src_mask = numa_get_membind();

      int pid = getpid();

      if (mask && src_mask) {
        // move all existing pages to the specified numa node.
        *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
        int page_num = numa_migrate_pages(pid, src_mask, mask);
        if (page_num == -1) {
          TORCH_WARN("numa_migrate_pages failed. errno: " +
                     std::to_string(errno));
        }

        // restrict memory allocation node.
        numa_set_membind(mask);
        numa_set_strict(1);

        numa_free_nodemask(mask);
        numa_free_nodemask(src_mask);
      } else {
        TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
                   std::to_string(errno));
      }
    }
  }

  // OMP threads binding

73  csrc/cpu/utils.hpp  Normal file
@@ -0,0 +1,73 @@
#ifndef UTILS_HPP
#define UTILS_HPP

#include <atomic>
#include <cassert>
#include <cstdint>
#include <unistd.h>

#if defined(__APPLE__)
#include <sys/sysctl.h>
#endif

#include "cpu_types.hpp"

namespace cpu_utils {
enum class ISA { AMX, VEC };

template <typename T>
struct VecTypeTrait {
  using vec_t = void;
};

template <>
struct VecTypeTrait<float> {
  using vec_t = vec_op::FP32Vec16;
};

#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
template <>
struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
};
#endif

template <>
struct VecTypeTrait<c10::Half> {
  using vec_t = vec_op::FP16Vec16;
};

struct Counter {
  std::atomic<int64_t> counter;
  char _padding[56];

  Counter() : counter(0) {}

  void reset_counter() { counter.store(0); }

  int64_t acquire_counter() { return counter++; }
};

inline int64_t get_l2_size() {
  static int64_t size = []() {
#if defined(__APPLE__)
    // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
    int64_t l2_cache_size = 0;
    size_t len = sizeof(l2_cache_size);
    if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
        l2_cache_size > 0) {
      return l2_cache_size >> 1;  // use 50% of L2 cache
    }
    // Fallback if sysctlbyname fails
    return 128LL * 1024 >> 1;  // use 50% of 128KB
#else
    long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
    assert(l2_cache_size != -1);
    return l2_cache_size >> 1;  // use 50% of L2 cache
#endif
  }();
  return size;
}
}  // namespace cpu_utils

#endif
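
A sketch of how these utilities are meant to compose in a blocked CPU kernel (the kernel below is illustrative, not from the commit): get_l2_size bounds a tile's working set at half the L2, and the cacheline-padded Counter hands out block indices to OMP threads without false sharing.

void process_blocks_dynamically(int64_t num_blocks) {
  static cpu_utils::Counter next_block;  // 64 bytes total: atomic + padding
  next_block.reset_counter();
  const int64_t tile_budget = cpu_utils::get_l2_size();  // ~50% of L2, bytes
#pragma omp parallel
  for (;;) {
    int64_t b = next_block.acquire_counter();  // dynamic work stealing
    if (b >= num_blocks) break;
    // ... process block b, sized so its working set fits tile_budget ...
  }
}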

@@ -22,15 +22,10 @@ torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
  auto strides = cpu_tensor.strides();
  auto options = cpu_tensor.options().device(torch::kCUDA);

  // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
  // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
  // memory, so we don't free it here.
  auto deleter = [](void*) {
    // no-op, since the memory is owned by the original CPU tensor
  };

  // use default no-op deleter, since the memory is owned by the original CPU
  // tensor
  torch::Tensor cuda_tensor =
      torch::from_blob(device_ptr, sizes, strides, deleter, options);
      torch::from_blob(device_ptr, sizes, strides, options);

  TORCH_CHECK(cuda_tensor.device().is_cuda(),
              "Resulting tensor is not on CUDA device");

@@ -3,14 +3,58 @@
// need to be unsigned long long
#include <iostream>

#include "cumem_allocator_compat.h"

#ifndef USE_ROCM
static const char* PYARGS_PARSE = "KKKK";
#else
#include <cstdlib>
#include <cerrno>
#include <climits>

// Default chunk size 256MB for ROCm. Can be overridden at runtime by the
// environment variable VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE, specified in megabytes
// (MB). The env value is parsed with strtoull as an integer number of MB
// (decimal or 0x hex). The parsed MB value is converted to bytes. If
// parsing fails, the value is 0, or the multiplication would overflow,
// the default (256MB) is used.
static const unsigned long long DEFAULT_MEMCREATE_CHUNK_SIZE =
    (256ULL * 1024ULL * 1024ULL);

static unsigned long long get_memcreate_chunk_size() {
  const char* env = getenv("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE");
  if (!env) return DEFAULT_MEMCREATE_CHUNK_SIZE;
  char* endptr = nullptr;
  errno = 0;
  unsigned long long val_mb = strtoull(env, &endptr, 0);
  if (endptr == env || errno != 0) {
    // parsing failed, fallback to default
    return DEFAULT_MEMCREATE_CHUNK_SIZE;
  }
  if (val_mb == 0) return DEFAULT_MEMCREATE_CHUNK_SIZE;

  const unsigned long long MB = 1024ULL * 1024ULL;
  // guard against overflow when converting MB -> bytes
  if (val_mb > (ULLONG_MAX / MB)) {
    return DEFAULT_MEMCREATE_CHUNK_SIZE;
  }
  return val_mb * MB;
}

static inline unsigned long long my_min(unsigned long long a,
                                        unsigned long long b) {
  return a < b ? a : b;
}

static const char* PYARGS_PARSE = "KKKO";
#endif

extern "C" {

#define PY_SSIZE_T_CLEAN
#include <Python.h>

#include <sys/types.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

char error_msg[10240];  // 10KB buffer to store error messages
CUresult no_error = CUresult(0);
@@ -49,7 +93,12 @@ void ensure_context(unsigned long long device) {
}

void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
#ifndef USE_ROCM
                    CUmemGenericAllocationHandle* p_memHandle) {
#else
                    CUmemGenericAllocationHandle** p_memHandle,
                    unsigned long long* chunk_sizes, size_t num_chunks) {
#endif
  ensure_context(device);
  // Define memory allocation properties
  CUmemAllocationProp prop = {};
@@ -58,6 +107,7 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
  prop.location.id = device;
  prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;

#ifndef USE_ROCM
  // Allocate memory using cuMemCreate
  CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
  if (error_code != 0) {
@@ -67,6 +117,39 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
  if (error_code != 0) {
    return;
  }
#else
  for (auto i = 0; i < num_chunks; ++i) {
    CUDA_CHECK(cuMemCreate(p_memHandle[i], chunk_sizes[i], &prop, 0));
    if (error_code != 0) {
      // Clean up previously created handles
      for (auto j = 0; j < i; ++j) {
        cuMemRelease(*(p_memHandle[j]));
      }
      return;
    }
  }
  unsigned long long allocated_size = 0;
  for (auto i = 0; i < num_chunks; ++i) {
    void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
    CUDA_CHECK(cuMemMap(map_addr, chunk_sizes[i], 0, *(p_memHandle[i]), 0));
    if (error_code != 0) {
      // unmap previously mapped chunks
      unsigned long long unmapped_size = 0;
      for (auto j = 0; j < i; ++j) {
        void* unmap_addr = (void*)((uintptr_t)d_mem + unmapped_size);
        cuMemUnmap(unmap_addr, chunk_sizes[j]);
        unmapped_size += chunk_sizes[j];
      }
      // release all created handles
      for (auto j = 0; j < num_chunks; ++j) {
        cuMemRelease(*(p_memHandle[j]));
      }
      return;
    }
    allocated_size += chunk_sizes[i];
  }
#endif

  CUmemAccessDesc accessDesc = {};
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = device;
@@ -82,10 +165,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,

void unmap_and_release(unsigned long long device, ssize_t size,
                       CUdeviceptr d_mem,
#ifndef USE_ROCM
                       CUmemGenericAllocationHandle* p_memHandle) {
#else
                       CUmemGenericAllocationHandle** p_memHandle,
                       unsigned long long* chunk_sizes, size_t num_chunks) {
#endif
  // std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
  // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
  ensure_context(device);
#ifndef USE_ROCM
  CUDA_CHECK(cuMemUnmap(d_mem, size));
  if (error_code != 0) {
    return;
@@ -94,6 +183,30 @@ void unmap_and_release(unsigned long long device, ssize_t size,
  if (error_code != 0) {
    return;
  }
#else
  unsigned long long allocated_size = 0;
  CUresult first_error = no_error;

  for (auto i = 0; i < num_chunks; ++i) {
    void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
    CUresult status = cuMemUnmap(map_addr, chunk_sizes[i]);
    if (status != no_error && first_error == no_error) {
      first_error = status;
    }
    allocated_size += chunk_sizes[i];
  }

  for (auto i = 0; i < num_chunks; ++i) {
    CUresult status = cuMemRelease(*(p_memHandle[i]));
    if (status != no_error && first_error == no_error) {
      first_error = status;
    }
  }

  if (first_error != no_error) {
    CUDA_CHECK(first_error);
  }
#endif
}

PyObject* create_tuple_from_c_integers(unsigned long long a,
@@ -120,6 +233,36 @@ PyObject* create_tuple_from_c_integers(unsigned long long a,
  return tuple;  // Return the created tuple
}

PyObject* create_tuple_from_c_mixed(unsigned long long a, unsigned long long b,
                                    unsigned long long c,
                                    CUmemGenericAllocationHandle** vec,
                                    unsigned long long* chunk_sizes,
                                    size_t num_chunks) {
  PyObject* tuple = PyTuple_New(4);
  if (!tuple) {
    return NULL;
  }

  // PyObject* list = PyList_New(vec.size());
  PyObject* list = PyList_New(num_chunks);
  for (auto i = 0; i < num_chunks; ++i) {
    PyObject* addr_size_pair = PyTuple_New(2);
    PyObject* addr = PyLong_FromUnsignedLongLong((unsigned long long)(vec[i]));
    PyObject* size =
        PyLong_FromUnsignedLongLong((unsigned long long)(chunk_sizes[i]));
    PyTuple_SetItem(addr_size_pair, 0, addr);
    PyTuple_SetItem(addr_size_pair, 1, size);
    PyList_SetItem(list, i, addr_size_pair);
  }

  PyTuple_SetItem(tuple, 0, PyLong_FromUnsignedLongLong(a));
  PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b));
  PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c));
  PyTuple_SetItem(tuple, 3, list);

  return tuple;
}

// ---------------------------------------------------------------------------
// Our exported C functions that call Python:

@@ -147,14 +290,55 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
  size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;

  CUdeviceptr d_mem;
#ifndef USE_ROCM
  CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
  if (error_code != 0) {
    return nullptr;
  }
#else
  CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, granularity, 0, 0));
  if (error_code != 0) {
    return nullptr;
  }
#endif

#ifndef USE_ROCM
  // allocate the CUmemGenericAllocationHandle
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)malloc(
          sizeof(CUmemGenericAllocationHandle));
#else
  // Make sure chunk size is aligned with hardware granularity. The base
  // chunk size can be configured via environment variable
  // ``VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE``; otherwise
  // DEFAULT_MEMCREATE_CHUNK_SIZE is used.
  size_t base_chunk = (size_t)get_memcreate_chunk_size();
  size_t aligned_chunk_size =
      ((base_chunk + granularity - 1) / granularity) * granularity;
  size_t num_chunks =
      (alignedSize + aligned_chunk_size - 1) / aligned_chunk_size;
  CUmemGenericAllocationHandle** p_memHandle =
      (CUmemGenericAllocationHandle**)malloc(
          num_chunks * sizeof(CUmemGenericAllocationHandle*));
  unsigned long long* chunk_sizes =
      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
  for (auto i = 0; i < num_chunks; ++i) {
    p_memHandle[i] = (CUmemGenericAllocationHandle*)malloc(
        sizeof(CUmemGenericAllocationHandle));
    if (p_memHandle[i] == nullptr) {
      std::cerr << "ERROR: malloc failed for p_memHandle[" << i << "].\n";
      for (auto j = 0; j < i; ++j) {
        free(p_memHandle[j]);
      }
      free(p_memHandle);
      free(chunk_sizes);
      return nullptr;
    }
    chunk_sizes[i] = (unsigned long long)my_min(
        (unsigned long long)(alignedSize - i * aligned_chunk_size),
        (unsigned long long)aligned_chunk_size);
  }
#endif

  if (!g_python_malloc_callback) {
    std::cerr << "ERROR: g_python_malloc_callback not set.\n";
@@ -164,9 +348,15 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
  // Acquire GIL (not in stable ABI officially, but often works)
  PyGILState_STATE gstate = PyGILState_Ensure();

#ifndef USE_ROCM
  PyObject* arg_tuple = create_tuple_from_c_integers(
      (unsigned long long)device, (unsigned long long)alignedSize,
      (unsigned long long)d_mem, (unsigned long long)p_memHandle);
#else
  PyObject* arg_tuple = create_tuple_from_c_mixed(
      (unsigned long long)device, (unsigned long long)alignedSize,
      (unsigned long long)d_mem, p_memHandle, chunk_sizes, num_chunks);
#endif

  // Call g_python_malloc_callback
  PyObject* py_result =
@@ -182,7 +372,27 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
  PyGILState_Release(gstate);

  // do the final mapping
#ifndef USE_ROCM
  create_and_map(device, alignedSize, d_mem, p_memHandle);
#else
  create_and_map(device, alignedSize, d_mem, p_memHandle, chunk_sizes,
                 num_chunks);
  free(chunk_sizes);
#endif

  if (error_code != 0) {
    // free address and the handle
    CUDA_CHECK(cuMemAddressFree(d_mem, alignedSize));
#ifndef USE_ROCM
    free(p_memHandle);
#else
    for (size_t i = 0; i < num_chunks; ++i) {
      free(p_memHandle[i]);
    }
    free(p_memHandle);
#endif
    return nullptr;
  }

  return (void*)d_mem;
}
@@ -206,36 +416,96 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {

  if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
    Py_XDECREF(py_result);
    Py_XDECREF(py_ptr);
    return;
  }

  unsigned long long recv_device, recv_size;
  unsigned long long recv_d_mem, recv_p_memHandle;
  unsigned long long recv_d_mem;
#ifndef USE_ROCM
  unsigned long long recv_p_memHandle;
#else
  PyObject* recv_p_memHandle;
#endif
  // Unpack the tuple into four C integers
  if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
  if (!PyArg_ParseTuple(py_result, PYARGS_PARSE, &recv_device, &recv_size,
                        &recv_d_mem, &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
    Py_XDECREF(py_result);
    Py_XDECREF(py_ptr);
    return;
  }

  // For ROCm, copy the Python list of (addr,size) pairs into C arrays while
  // holding the GIL. Then release the GIL and call the unmap/release helper
  // using the copied arrays. This avoids calling PyList_* APIs without the
  // GIL (which is undefined behavior and can crash when called from other
  // threads).
  CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem;
#ifdef USE_ROCM
  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
  CUmemGenericAllocationHandle** p_memHandle =
      (CUmemGenericAllocationHandle**)malloc(
          num_chunks * sizeof(CUmemGenericAllocationHandle*));
  if (p_memHandle == nullptr) {
    Py_DECREF(py_ptr);
    Py_DECREF(py_result);
    PyGILState_Release(gstate);
    std::cerr << "ERROR: malloc failed for p_memHandle in my_free."
              << std::endl;
    return;
  }
  unsigned long long* chunk_sizes =
      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
  if (chunk_sizes == nullptr) {
    free(p_memHandle);
    Py_DECREF(py_ptr);
    Py_DECREF(py_result);
    PyGILState_Release(gstate);
    std::cerr << "ERROR: malloc failed for chunk_sizes in my_free."
              << std::endl;
    return;
  }
  for (Py_ssize_t i = 0; i < num_chunks; ++i) {
    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
    PyObject* addr_py = PyTuple_GetItem(item, 0);
    PyObject* size_py = PyTuple_GetItem(item, 1);
    p_memHandle[i] =
        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
    chunk_sizes[i] = (unsigned long long)PyLong_AsUnsignedLongLong(size_py);
  }

  // Drop temporary Python refs, then release the GIL before calling into
  // non-Python APIs.
  Py_DECREF(py_ptr);
  Py_DECREF(py_result);
  PyGILState_Release(gstate);

  // recv_size == size
  // recv_device == device
  unmap_and_release(device, size, d_mem, p_memHandle, chunk_sizes, num_chunks);
#else
  // Non-ROCm path: simple integer handle already extracted; drop temporary
  // Python refs while still holding the GIL, then release it.
  Py_DECREF(py_ptr);
  Py_DECREF(py_result);
  PyGILState_Release(gstate);

  // Free memory

  CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem;
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)recv_p_memHandle;
  unmap_and_release(device, size, d_mem, p_memHandle);
#endif

  // free address and the handle
  CUDA_CHECK(cuMemAddressFree(d_mem, size));
  if (error_code != 0) {
    return;
#ifndef USE_ROCM
  free(p_memHandle);
#else
  for (auto i = 0; i < num_chunks; ++i) {
    free(p_memHandle[i]);
  }
  free(p_memHandle);
  free(chunk_sizes);
#endif
}

// ---------------------------------------------------------------------------
@@ -271,19 +541,87 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
}

  unsigned long long recv_device, recv_size;
  unsigned long long recv_d_mem, recv_p_memHandle;
  unsigned long long recv_d_mem;
#ifndef USE_ROCM
  unsigned long long recv_p_memHandle;
#else
  PyObject* recv_p_memHandle;
#endif
  // Unpack the tuple into four C integers
  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
                        &recv_p_memHandle)) {
  if (!PyArg_ParseTuple(args, PYARGS_PARSE, &recv_device, &recv_size,
                        &recv_d_mem, &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
    return nullptr;
  }

  CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
#ifndef USE_ROCM
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)recv_p_memHandle;

  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
#else
  if (!PyList_Check(recv_p_memHandle)) {
    PyErr_SetString(PyExc_TypeError,
                    "Expected a list for the 4th argument on ROCm");
    return nullptr;
  }
  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
  if (num_chunks < 0) {
    return nullptr;  // PyList_Size sets an exception on error.
  }
  CUmemGenericAllocationHandle** p_memHandle =
      (CUmemGenericAllocationHandle**)malloc(
          num_chunks * sizeof(CUmemGenericAllocationHandle*));
  if (p_memHandle == nullptr) {
    PyErr_SetString(PyExc_MemoryError, "malloc failed for p_memHandle");
    return nullptr;
  }
  unsigned long long* chunk_sizes =
      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
  if (chunk_sizes == nullptr) {
    free(p_memHandle);
    PyErr_SetString(PyExc_MemoryError, "malloc failed for chunk_sizes");
    return nullptr;
  }
  for (Py_ssize_t i = 0; i < num_chunks; ++i) {
    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
    if (item == nullptr || !PyTuple_Check(item) || PyTuple_Size(item) != 2) {
      free(p_memHandle);
      free(chunk_sizes);
      PyErr_SetString(
          PyExc_TypeError,
          "List items must be tuples of size 2 (handle_addr, size)");
      return nullptr;
    }
    PyObject* addr_py = PyTuple_GetItem(item, 0);
    PyObject* size_py = PyTuple_GetItem(item, 1);
    if (addr_py == nullptr || size_py == nullptr) {
      free(p_memHandle);
      free(chunk_sizes);
      return nullptr;  // PyTuple_GetItem sets an exception
    }
    p_memHandle[i] =
        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
    if (PyErr_Occurred()) {
      free(p_memHandle);
      free(chunk_sizes);
      return nullptr;
    }
    chunk_sizes[i] = (unsigned long long)PyLong_AsUnsignedLongLong(size_py);
    if (PyErr_Occurred()) {
      free(p_memHandle);
      free(chunk_sizes);
      return nullptr;
    }
  }

  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle, chunk_sizes,
                    num_chunks);

  free(p_memHandle);
  free(chunk_sizes);
#endif

  if (error_code != 0) {
    error_code = no_error;
@@ -301,19 +639,56 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
}

  unsigned long long recv_device, recv_size;
  unsigned long long recv_d_mem, recv_p_memHandle;
  unsigned long long recv_d_mem;
#ifndef USE_ROCM
  unsigned long long recv_p_memHandle;
#else
  PyObject* recv_p_memHandle;
#endif
  // Unpack the tuple into four C integers
  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
                        &recv_p_memHandle)) {
  if (!PyArg_ParseTuple(args, PYARGS_PARSE, &recv_device, &recv_size,
                        &recv_d_mem, &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
    return nullptr;
  }

  CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem;
#ifndef USE_ROCM
  CUmemGenericAllocationHandle* p_memHandle =
      (CUmemGenericAllocationHandle*)recv_p_memHandle;

  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
#else
  Py_ssize_t num_chunks = PyList_Size(recv_p_memHandle);
  CUmemGenericAllocationHandle** p_memHandle =
      (CUmemGenericAllocationHandle**)malloc(
          num_chunks * sizeof(CUmemGenericAllocationHandle*));
  if (p_memHandle == nullptr) {
    PyErr_SetString(PyExc_MemoryError, "malloc failed for p_memHandle");
    return nullptr;
  }
  unsigned long long* chunk_sizes =
      (unsigned long long*)malloc(num_chunks * sizeof(unsigned long long));
  if (chunk_sizes == nullptr) {
    free(p_memHandle);
    PyErr_SetString(PyExc_MemoryError, "malloc failed for chunk_sizes");
    return nullptr;
  }
  for (auto i = 0; i < num_chunks; ++i) {
    PyObject* item = PyList_GetItem(recv_p_memHandle, i);
    PyObject* addr_py = PyTuple_GetItem(item, 0);
    PyObject* size_py = PyTuple_GetItem(item, 1);
    p_memHandle[i] =
        (CUmemGenericAllocationHandle*)PyLong_AsUnsignedLongLong(addr_py);
    chunk_sizes[i] = PyLong_AsUnsignedLongLong(size_py);
  }

  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle, chunk_sizes,
                 num_chunks);

  free(p_memHandle);
  free(chunk_sizes);
#endif

  if (error_code != 0) {
    error_code = no_error;
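
The chunking arithmetic used by my_malloc above is easiest to see in isolation. A standalone sketch (function name and use of std::vector are illustrative, not from the commit): every chunk is full-sized except possibly the last, which carries the remainder, exactly what the my_min(alignedSize - i * chunk, chunk) expression produces.

#include <cstddef>
#include <vector>

std::vector<size_t> plan_chunks(size_t aligned_size, size_t aligned_chunk_size) {
  std::vector<size_t> sizes;
  for (size_t off = 0; off < aligned_size; off += aligned_chunk_size) {
    size_t rest = aligned_size - off;
    sizes.push_back(rest < aligned_chunk_size ? rest : aligned_chunk_size);
  }
  return sizes;  // e.g. 600 MB with 256 MB chunks -> {256 MB, 256 MB, 88 MB}
}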

109  csrc/cumem_allocator_compat.h  Normal file
@@ -0,0 +1,109 @@
#pragma once

#ifdef USE_ROCM
////////////////////////////////////////
// For compatibility with CUDA and ROCm
////////////////////////////////////////
#include <hip/hip_runtime_api.h>

extern "C" {
#ifndef CUDA_SUCCESS
#define CUDA_SUCCESS hipSuccess
#endif  // CUDA_SUCCESS

// https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Driver_API_functions_supported_by_HIP.html
typedef unsigned long long CUdevice;
typedef hipDeviceptr_t CUdeviceptr;
typedef hipError_t CUresult;
typedef hipCtx_t CUcontext;
typedef hipStream_t CUstream;
typedef hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle;
typedef hipMemAllocationGranularity_flags CUmemAllocationGranularity_flags;
typedef hipMemAllocationProp CUmemAllocationProp;
typedef hipMemAccessDesc CUmemAccessDesc;

#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_MEM_ALLOC_GRANULARITY_MINIMUM hipMemAllocationGranularityMinimum

// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
#define CU_MEM_ALLOCATION_COMP_NONE 0x0

// Error Handling
// https://docs.nvidia.com/cuda/archive/11.4.4/cuda-driver-api/group__CUDA__ERROR.html
CUresult cuGetErrorString(CUresult hipError, const char** pStr) {
  *pStr = hipGetErrorString(hipError);
  return CUDA_SUCCESS;
}

// Context Management
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html
CUresult cuCtxGetCurrent(CUcontext* ctx) {
  // This API is deprecated on the AMD platform; it exists here only to
  // mirror the equivalent cuCtx driver API on the NVIDIA platform.
  return hipCtxGetCurrent(ctx);
}

CUresult cuCtxSetCurrent(CUcontext ctx) {
  // This API is deprecated on the AMD platform; it exists here only to
  // mirror the equivalent cuCtx driver API on the NVIDIA platform.
  return hipCtxSetCurrent(ctx);
}

// Primary Context Management
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html
CUresult cuDevicePrimaryCtxRetain(CUcontext* ctx, CUdevice dev) {
  return hipDevicePrimaryCtxRetain(ctx, dev);
}

// Virtual Memory Management
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html
CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) {
  return hipMemAddressFree(ptr, size);
}

CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return hipMemAddressReserve(ptr, size, alignment, addr, flags);
}

CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size,
                     const CUmemAllocationProp* prop,
                     unsigned long long flags) {
  return hipMemCreate(handle, size, prop, flags);
}

CUresult cuMemGetAllocationGranularity(
    size_t* granularity, const CUmemAllocationProp* prop,
    CUmemAllocationGranularity_flags option) {
  return hipMemGetAllocationGranularity(granularity, prop, option);
}

CUresult cuMemMap(CUdeviceptr dptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return hipMemMap(dptr, size, offset, handle, flags);
}

CUresult cuMemRelease(CUmemGenericAllocationHandle handle) {
  return hipMemRelease(handle);
}

CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc* desc, size_t count) {
  return hipMemSetAccess(ptr, size, desc, count);
}

CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) {
  return hipMemUnmap(ptr, size);
}
}  // extern "C"

#else
////////////////////////////////////////
// Import CUDA headers for NVIDIA GPUs
////////////////////////////////////////
#include <cuda_runtime_api.h>
#include <cuda.h>
#endif
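
With this shim in place, the allocator can be written once against the CUDA driver-API names. A minimal sketch (not from the commit; error handling omitted) of the reserve/create/map/set-access sequence those wrappers cover on both backends:

void reserve_create_map(int device, size_t size) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;

  size_t gran = 0;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size = ((size + gran - 1) / gran) * gran;  // round up to HW granularity

  CUdeviceptr ptr;
  cuMemAddressReserve(&ptr, size, gran, 0, 0);  // reserve a VA range
  CUmemGenericAllocationHandle handle;
  cuMemCreate(&handle, size, &prop, 0);         // create physical memory
  cuMemMap(ptr, size, 0, handle, 0);            // back the VA range with it

  CUmemAccessDesc access = {};
  access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  access.location.id = device;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(ptr, size, &access, 1);        // enable device read/write
}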

@@ -88,3 +88,53 @@
#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(                                              \
      TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))

#define VLLM_DISPATCH_VEC_SIZE(VEC_SIZE, ...) \
  switch (VEC_SIZE) {                         \
    case 16: {                                \
      constexpr int vec_size = 16;            \
      __VA_ARGS__();                          \
      break;                                  \
    }                                         \
    case 8: {                                 \
      constexpr int vec_size = 8;             \
      __VA_ARGS__();                          \
      break;                                  \
    }                                         \
    case 4: {                                 \
      constexpr int vec_size = 4;             \
      __VA_ARGS__();                          \
      break;                                  \
    }                                         \
    case 2: {                                 \
      constexpr int vec_size = 2;             \
      __VA_ARGS__();                          \
      break;                                  \
    }                                         \
    default: {                                \
      constexpr int vec_size = 1;             \
      __VA_ARGS__();                          \
      break;                                  \
    }                                         \
  }

#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...)                                  \
  switch (NUM_DIMS) {                                                         \
    case 2: {                                                                 \
      constexpr int tensor_rank = 2;                                          \
      __VA_ARGS__();                                                          \
      break;                                                                  \
    }                                                                         \
    case 3: {                                                                 \
      constexpr int tensor_rank = 3;                                          \
      __VA_ARGS__();                                                          \
      break;                                                                  \
    }                                                                         \
    case 4: {                                                                 \
      constexpr int tensor_rank = 4;                                          \
      __VA_ARGS__();                                                          \
      break;                                                                  \
    }                                                                         \
    default:                                                                  \
      TORCH_CHECK(false, "Expects rank 2, 3 or 4 tensors but got ", NUM_DIMS); \
  }
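
Both macros follow the same pattern: turn a runtime value into a constexpr int so a templated body can be instantiated per case. A usage sketch for VLLM_DISPATCH_VEC_SIZE (copy_vectorized is a hypothetical kernel, not from the commit):

template <int vec_size>
void copy_vectorized(const float* src, float* dst, int64_t n);  // hypothetical

void copy(const float* src, float* dst, int64_t n, int runtime_vec_size) {
  VLLM_DISPATCH_VEC_SIZE(runtime_vec_size, [&] {
    // vec_size is a constexpr int here: 16/8/4/2, or 1 in the default case.
    copy_vectorized<vec_size>(src, dst, n);
  });
}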
|
||||
|
||||
428
csrc/fused_qknorm_rope_kernel.cu
Normal file
428
csrc/fused_qknorm_rope_kernel.cu
Normal file
@@ -0,0 +1,428 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <cmath>
|
||||
#include <cuda_runtime.h>
|
||||
#include <type_traits>
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
#include "type_convert.cuh"
|
||||
|
||||
#define CHECK_TYPE(x, st) \
|
||||
TORCH_CHECK(x.scalar_type() == st, #x " dtype is ", x.scalar_type(), \
|
||||
", while ", st, " is expected")
|
||||
#define CHECK_TH_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x) \
|
||||
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
|
||||
#define CHECK_INPUT(x) \
|
||||
CHECK_TH_CUDA(x); \
|
||||
CHECK_CONTIGUOUS(x)
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#define FINAL_MASK 0xffffffffffffffffULL
|
||||
|
||||
#if defined(HIP_VERSION) && HIP_VERSION < 70000000
|
||||
// On ROCm versions before 7.0, __syncwarp isn't defined. The below
|
||||
// implementation is copy/pasted from the implementation in ROCm 7.0
|
||||
__device__ inline void __syncwarp() {
|
||||
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
|
||||
__builtin_amdgcn_wave_barrier();
|
||||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#define FINAL_MASK 0xffffffff
|
||||
#endif
|
||||
|
||||
namespace tensorrt_llm::common {
|
||||
template <typename T, int num>
|
||||
struct packed_as;
|
||||
// Specialization for packed_as used in this kernel.
|
||||
template <>
|
||||
struct packed_as<uint, 1> {
|
||||
using type = uint;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct packed_as<uint, 2> {
|
||||
using type = uint2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct packed_as<uint, 4> {
|
||||
using type = uint4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__ T warpReduceSum(T val) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1)
|
||||
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
|
||||
return val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline __device__ __host__ T divUp(T m, T n) {
|
||||
return (m + n - 1) / n;
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
namespace tensorrt_llm::kernels {
|
||||
// NOTE(zhuhaoran): This kernel is adapted from TensorRT-LLM implementation,
|
||||
// with added support for passing the cos_sin_cache as an input.
|
||||
// https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu
|
||||
|
||||
// Perform per-head QK Norm and RoPE in a single kernel.
|
||||
// scalar_t_in: data type of QKV and RMSNorm weights
|
||||
// scalar_t_cache: data type of cos/sin cache
|
||||
// head_dim: the dimension of each head
|
||||
// interleave: interleave=!is_neox.
|
||||
template <typename scalar_t_in, typename scalar_t_cache, int head_dim,
|
||||
bool interleave>
|
||||
__global__ void fusedQKNormRopeKernel(
|
||||
void* qkv_void, // Combined QKV tensor
|
||||
int const num_heads_q, // Number of query heads
|
||||
int const num_heads_k, // Number of key heads
|
||||
int const num_heads_v, // Number of value heads
|
||||
float const eps, // Epsilon for RMS normalization
|
||||
void const* q_weight_void, // RMSNorm weights for query
|
||||
void const* k_weight_void, // RMSNorm weights for key
|
||||
void const* cos_sin_cache_void, // Pre-computed cos/sin cache
|
||||
int64_t const* position_ids, // Position IDs for RoPE
|
||||
int const num_tokens // Number of tokens
|
||||
) {
|
||||
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
|
||||
if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
|
||||
std::is_same_v<scalar_t_cache, c10::BFloat16>) {
|
||||
return;
|
||||
} else {
|
||||
#endif
|
||||
|
||||
using Converter = vllm::_typeConvert<scalar_t_in>;
|
||||
static_assert(Converter::exists,
|
||||
"Input QKV data type is not supported for this CUDA "
|
||||
"architecture or toolkit version.");
|
||||
using T_in = typename Converter::hip_type;
|
||||
using T2_in = typename Converter::packed_hip_type;
|
||||
|
||||
using CacheConverter = vllm::_typeConvert<scalar_t_cache>;
|
||||
static_assert(CacheConverter::exists,
|
||||
"Cache data type is not supported for this CUDA architecture "
|
||||
"or toolkit version.");
|
||||
using T_cache = typename CacheConverter::hip_type;
|
||||
|
||||
T_in* qkv = reinterpret_cast<T_in*>(qkv_void);
|
||||
T_in const* q_weight = reinterpret_cast<T_in const*>(q_weight_void);
|
||||
T_in const* k_weight = reinterpret_cast<T_in const*>(k_weight_void);
|
||||
T_cache const* cos_sin_cache =
|
||||
reinterpret_cast<T_cache const*>(cos_sin_cache_void);
|
||||
|
||||
int const warpsPerBlock = blockDim.x / 32;
|
||||
int const warpId = threadIdx.x / 32;
|
||||
int const laneId = threadIdx.x % 32;
|
||||
|
||||
// Calculate global warp index to determine which head/token this warp
|
||||
// processes
|
||||
int const globalWarpIdx = blockIdx.x * warpsPerBlock + warpId;
|
||||
|
||||
// Total number of attention heads (Q and K)
|
||||
int const total_qk_heads = num_heads_q + num_heads_k;
|
||||
|
||||
// Determine which token and head type (Q or K) this warp processes
|
||||
int const tokenIdx = globalWarpIdx / total_qk_heads;
|
||||
int const localHeadIdx = globalWarpIdx % total_qk_heads;
|
||||
|
||||
// Skip if this warp is assigned beyond the number of tokens
|
||||
if (tokenIdx >= num_tokens) return;
|
||||
|
||||
bool const isQ = localHeadIdx < num_heads_q;
|
||||
int const headIdx = isQ ? localHeadIdx : localHeadIdx - num_heads_q;
|
||||
|
||||
int const num_heads = num_heads_q + num_heads_k + num_heads_v;
|
||||
|
||||
static_assert(head_dim % (32 * 2) == 0,
|
||||
"head_dim must be divisible by 64 (each warp processes one "
|
||||
"head, and each thread gets even number of "
|
||||
"elements)");
|
||||
constexpr int numElemsPerThread = head_dim / 32;
|
||||
float elements[numElemsPerThread];
|
||||
constexpr int elemSizeBytes = numElemsPerThread * sizeof(__nv_bfloat16);
|
||||
static_assert(elemSizeBytes % 4 == 0,
|
||||
"numSizeBytes must be a multiple of 4");
|
||||
constexpr int vecSize =
|
||||
elemSizeBytes /
|
||||
4; // Use packed_as<uint, vecSize> to perform loading/saving.
|
||||
using vec_T = typename tensorrt_llm::common::packed_as<uint, vecSize>::type;
|
||||
|
||||
int offsetWarp; // Offset for the warp
|
||||
if (isQ) {
|
||||
// Q segment: token offset + head offset within Q segment
|
||||
offsetWarp = tokenIdx * num_heads * head_dim + headIdx * head_dim;
|
||||
} else {
|
||||
// K segment: token offset + entire Q segment + head offset within K
|
||||
// segment
|
||||
offsetWarp = tokenIdx * num_heads * head_dim + num_heads_q * head_dim +
|
||||
headIdx * head_dim;
|
||||
}
|
||||
int offsetThread = offsetWarp + laneId * numElemsPerThread;
|
||||
|
||||
// Sum of squares for RMSNorm
|
||||
float sumOfSquares = 0.0f;
|
||||
|
||||
// Load.
|
||||
{
|
||||
vec_T vec = *reinterpret_cast<vec_T const*>(&qkv[offsetThread]);
|
||||
constexpr int num_packed_elems = elemSizeBytes / sizeof(T2_in);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < num_packed_elems; i++) {
|
||||
// Interpret the generic vector chunk as the specific packed type
|
||||
T2_in packed_val = *(reinterpret_cast<T2_in*>(&vec) + i);
|
||||
// Convert to float2 for computation
|
||||
float2 vals = Converter::convert(packed_val);
|
||||
sumOfSquares += vals.x * vals.x;
|
||||
sumOfSquares += vals.y * vals.y;
|
||||
|
||||
elements[2 * i] = vals.x;
|
||||
elements[2 * i + 1] = vals.y;
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce sum across warp using the utility function
|
||||
sumOfSquares = tensorrt_llm::common::warpReduceSum(sumOfSquares);
|
||||
|
||||
// Compute RMS normalization factor
|
||||
float rms_rcp = rsqrtf(sumOfSquares / static_cast<float>(head_dim) + eps);
|
||||
|
||||
// Normalize elements
|
||||
#pragma unroll
|
||||
for (int i = 0; i < numElemsPerThread; i++) {
|
||||
int dim = laneId * numElemsPerThread + i;
|
||||
float weight = isQ ? Converter::convert(q_weight[dim])
|
||||
: Converter::convert(k_weight[dim]);
|
||||
elements[i] *= rms_rcp * weight;
|
||||
}
|
||||
|
||||
// Apply RoPE to normalized elements
|
||||
float elements2[numElemsPerThread]; // Additional buffer required for RoPE.
|
||||
|
||||
int64_t pos_id = position_ids[tokenIdx];
|
||||
|
||||
// Calculate cache pointer for this position - similar to
|
||||
// pos_encoding_kernels.cu
|
||||
T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
|
||||
int const embed_dim = head_dim / 2;
|
||||
T_cache const* cos_ptr = cache_ptr;
|
||||
T_cache const* sin_ptr = cache_ptr + embed_dim;
|
||||
|

  if constexpr (interleave) {
    // Perform interleaving. Use pre-computed cos/sin values.
#pragma unroll
    for (int i = 0; i < numElemsPerThread / 2; ++i) {
      int const idx0 = 2 * i;
      int const idx1 = 2 * i + 1;

      float const val0 = elements[idx0];
      float const val1 = elements[idx1];

      int const dim_idx = laneId * numElemsPerThread + idx0;
      int const half_dim = dim_idx / 2;
      float const cos_val =
          CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
      float const sin_val =
          CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));

      elements[idx0] = val0 * cos_val - val1 * sin_val;
      elements[idx1] = val0 * sin_val + val1 * cos_val;
    }
  } else {
    // Before exchanging data within the warp, we need to sync.
    __syncwarp();
    // Get the data from the other half of the warp. Use pre-computed cos/sin
    // values.
#pragma unroll
    for (int i = 0; i < numElemsPerThread; i++) {
      elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
      if (laneId < 16) {
        elements2[i] = -elements2[i];
      }

      int dim_idx = laneId * numElemsPerThread + i;
      dim_idx = (dim_idx * 2) % head_dim;
      int half_dim = dim_idx / 2;
      // Use pre-computed cos/sin from cache
      float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
      float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));

      elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
    }
    // __shfl_xor_sync does not provide a memory fence. Need to sync again.
    __syncwarp();
  }
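
  // Both branches apply the same 2D rotation
  //   (x, y) -> (x*cos - y*sin, x*sin + y*cos)
  // to each rotary pair; they differ only in where the pair lives.
  // Interleaved (GPT-J style) pairs adjacent elements (2i, 2i+1), so each
  // thread already owns both halves of its pairs. NeoX style pairs element i
  // with element i + head_dim/2, which sits in the other half of the warp,
  // hence the __shfl_xor_sync with mask 16 to fetch the partner value and
  // the sign flip on the lower lanes to form x*cos - y*sin.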

  // Store.
  {
    vec_T vec;
    constexpr int num_packed_elems = elemSizeBytes / sizeof(T2_in);
#pragma unroll
    for (int i = 0; i < num_packed_elems; i++) {
      // Convert from float2 back to the specific packed type
      T2_in packed_val = Converter::convert(
          make_float2(elements[2 * i], elements[2 * i + 1]));
      // Place it into the generic vector
      *(reinterpret_cast<T2_in*>(&vec) + i) = packed_val;
    }
    *reinterpret_cast<vec_T*>(&qkv[offsetThread]) = vec;
  }

#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
  }
#endif
}

// Borrowed from
// https://github.com/flashinfer-ai/flashinfer/blob/8125d079a43e9a0ba463a4ed1b639cefd084cec9/include/flashinfer/pos_enc.cuh#L568
#define DISPATCH_INTERLEAVE(interleave, INTERLEAVE, ...) \
  if (interleave) {                                      \
    const bool INTERLEAVE = true;                        \
    __VA_ARGS__                                          \
  } else {                                               \
    const bool INTERLEAVE = false;                       \
    __VA_ARGS__                                          \
  }
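
// Usage sketch for DISPATCH_INTERLEAVE (illustrative, mirrors the switch
// below; T_in/T_cache stand in for the dispatched scalar types): the macro
// shadows the runtime bool with a `const bool` of the same value, which is a
// constant expression, so the body can use it as a template argument and each
// branch instantiates one compile-time specialization:
//   DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
//     fusedQKNormRopeKernel<T_in, T_cache, 128, INTERLEAVE>
//         <<<grid, block, 0, stream>>>(...);
//   });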

template <typename scalar_t_in, typename scalar_t_cache>
void launchFusedQKNormRope(void* qkv, int const num_tokens,
                           int const num_heads_q, int const num_heads_k,
                           int const num_heads_v, int const head_dim,
                           float const eps, void const* q_weight,
                           void const* k_weight, void const* cos_sin_cache,
                           bool const interleave, int64_t const* position_ids,
                           cudaStream_t stream) {
  constexpr int blockSize = 256;

  int const warpsPerBlock = blockSize / 32;
  int const totalQKHeads = num_heads_q + num_heads_k;
  int const totalWarps = num_tokens * totalQKHeads;

  int const gridSize = common::divUp(totalWarps, warpsPerBlock);
  dim3 gridDim(gridSize);
  dim3 blockDim(blockSize);
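
  // Work decomposition: one warp per (token, Q-or-K head); V heads are left
  // untouched. Illustrative sizing (hypothetical): num_tokens = 1024 with
  // 32 Q heads and 8 K heads gives totalWarps = 1024 * 40 = 40960; at
  // 8 warps per 256-thread block that launches gridSize = 5120 blocks.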
  switch (head_dim) {
    case 64:
      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
            <<<gridDim, blockDim, 0, stream>>>(
                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
                k_weight, cos_sin_cache, position_ids, num_tokens);
      });
      break;
    case 128:
      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
            <<<gridDim, blockDim, 0, stream>>>(
                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
                k_weight, cos_sin_cache, position_ids, num_tokens);
      });
      break;
    case 256:
      DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
        fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
            <<<gridDim, blockDim, 0, stream>>>(
                qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
                k_weight, cos_sin_cache, position_ids, num_tokens);
      });
      break;
    default:
      TORCH_CHECK(false,
                  "Unsupported head dimension for fusedQKNormRope: ", head_dim);
  }
}
}  // namespace tensorrt_llm::kernels

void fused_qk_norm_rope(
    torch::Tensor& qkv,       // Combined QKV tensor [num_tokens,
                              // (num_heads_q+num_heads_k+num_heads_v)*head_dim]
    int64_t num_heads_q,      // Number of query heads
    int64_t num_heads_k,      // Number of key heads
    int64_t num_heads_v,      // Number of value heads
    int64_t head_dim,         // Dimension per head
    double eps,               // Epsilon for RMS normalization
    torch::Tensor& q_weight,  // RMSNorm weights for query [head_dim]
    torch::Tensor& k_weight,  // RMSNorm weights for key [head_dim]
    torch::Tensor& cos_sin_cache,  // Cos/sin cache [max_position, head_dim]
    bool is_neox,                  // Whether RoPE is applied in NeoX style
    torch::Tensor& position_ids    // Position IDs for RoPE [num_tokens]
) {
  // Input validation
  CHECK_INPUT(qkv);
  CHECK_INPUT(position_ids);
  CHECK_INPUT(q_weight);
  CHECK_INPUT(k_weight);
  CHECK_INPUT(cos_sin_cache);
  CHECK_TYPE(position_ids, torch::kInt64);

  TORCH_CHECK(qkv.dim() == 2,
              "QKV tensor must be 2D: [num_tokens, "
              "(num_heads_q+num_heads_k+num_heads_v)*head_dim]");
  TORCH_CHECK(position_ids.dim() == 1, "Position IDs must be 1D: [num_tokens]");
  TORCH_CHECK(q_weight.dim() == 1, "Query weights must be 1D: [head_dim]");
  TORCH_CHECK(k_weight.dim() == 1, "Key weights must be 1D: [head_dim]");
  TORCH_CHECK(cos_sin_cache.dim() == 2,
              "Cos/sin cache must be 2D: [max_position, head_dim]");
  TORCH_CHECK(q_weight.size(0) == head_dim,
              "Query weights size must match head dimension");
  TORCH_CHECK(k_weight.size(0) == head_dim,
              "Key weights size must match head dimension");
  TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
              "Cos/sin cache dimension must match head_dim");
  TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
                  qkv.scalar_type() == k_weight.scalar_type(),
              "qkv, q_weight and k_weight must have the same dtype");

  int64_t num_tokens = qkv.size(0);
  TORCH_CHECK(position_ids.size(0) == num_tokens,
              "Number of tokens in position_ids must match QKV");

  int64_t total_heads = num_heads_q + num_heads_k + num_heads_v;
  TORCH_CHECK(
      qkv.size(1) == total_heads * head_dim,
      "QKV tensor size must match total number of heads and head dimension");
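
  // Shape example (hypothetical sizes): with 32 Q heads, 8 K heads, 8 V heads
  // and head_dim = 128, qkv must be [num_tokens, (32+8+8)*128], i.e.
  // [num_tokens, 6144].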

  auto stream = at::cuda::getCurrentCUDAStream(qkv.get_device());
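
  // Note the inversion below: the kernel's `interleave` flag selects
  // adjacent-pair (GPT-J style) rotation, which is the complement of
  // NeoX-style half-split rotation, hence `!is_neox` is passed through.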
  VLLM_DISPATCH_HALF_TYPES(qkv.scalar_type(), "fused_qk_norm_rope_kernel", [&] {
    using qkv_scalar_t = scalar_t;
    VLLM_DISPATCH_FLOATING_TYPES(
        cos_sin_cache.scalar_type(), "fused_qk_norm_rope_kernel", [&] {
          using cache_scalar_t = scalar_t;
          tensorrt_llm::kernels::launchFusedQKNormRope<qkv_scalar_t,
                                                       cache_scalar_t>(
              qkv.data_ptr(), static_cast<int>(num_tokens),
              static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
              static_cast<int>(num_heads_v), static_cast<int>(head_dim),
              static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
              cos_sin_cache.data_ptr(), !is_neox,
              reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
              stream);
        });
  });
}

Some files were not shown because too many files have changed in this diff.